# howard.objects.variants
import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel
import cyvcf2
import pyBigWig

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        The function `__init__` initializes the variables, sets the input, output, config, param,
        connexion and header.

        :param conn: the connection to the database
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: if True, load the input data immediately after initialization
        """

        # FIX: config/param used to default to a shared mutable `{}` — a classic
        # mutable-default-argument bug (state shared across instances). Callers
        # passing a dict are unaffected.
        if config is None:
            config = {}
        if param is None:
            param = {}

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        The function `set_samples` sets the samples attribute of an object to a provided list or
        retrieves it from the parameter dictionary (key "samples" > "list").

        :param samples: list of sample names; if None, the list is looked up in the
            parameters via `get_param()`
        :type samples: list
        :return: The `samples` list is being returned (possibly None).
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        This function returns a list of samples.
        :return: The `samples` attribute of the object.
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        This function returns the value of the "check" key within the "samples" dictionary
        retrieved from the parameters.
        :return: the value of "samples" > "check" in the parameters; defaults to `True`
            when the key is absent.
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        The function `set_input` takes a file name as input, extracts the name and extension, and
        sets the related attributes (`input`, `input_name`, `input_extension`, `input_format`).

        :param input: the input file, either a path string or a file-like object exposing
            a `.name` attribute
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            # FIX: narrowed bare `except:`; only a missing `.name` attribute is expected here
            except AttributeError:
                # FIX: error message had an unclosed quote ("'{input} in bad format")
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        The set_config function takes a config object and assigns it as the configuration object
        for the class.

        :param config: dictionary of configuration settings for the class
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        This function sets a parameter object for the class based on the input dictionary.

        :param param: dictionary set as the `param` attribute of the instance
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        This function initializes the variables that will be used in the rest of the class.
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Map of filter operator keywords to SQL comparison operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type name -> internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type name -> SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        It returns the value of the key "indexing" in the parameters. If the key is not present,
        it returns False.
        :return: The value of the indexing parameter.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        The function `get_connexion_config` returns a dictionary containing the configuration for
        a connection, including the number of threads, memory limit, temporary directory and
        access mode.
        :return: a dictionary containing the connexion configuration.
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # FIX: original tested `connexion_db in ":memory:"`, a substring test that
            # is true for e.g. "mem" or ":"; an in-memory database must be writable,
            # so test equality with the ":memory:" sentinel instead
            if connexion_db == ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file
        (YAML/JSON) or a JSON string.
        :return: a dictionary of DuckDB settings (empty if none configured).
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb setting is a file
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    # yaml.safe_load also parses JSON, so both formats are accepted
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        The function `set_connexion_db` returns the appropriate database connection string based
        on the input format and connection type, and stores it on the instance.
        :return: the value of the variable `connexion_db`.
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            # the input file itself is the database
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        The function `set_connexion` creates a connection to a database, with options for
        different database formats (duckdb or sqlite) and settings.

        :param conn: existing connection to the database; if None, a new connection is
            created according to the configured connexion format and db
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings, applied as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        The `set_output` function sets the output file and derives the output name, extension
        and format attributes. If no output is provided, they are set to `None`.

        :param output: the output file, either a path string or a file-like object exposing
            a `.name` attribute
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        It reads the header of a VCF file and stores it as a list of strings (`header_list`)
        and as a VCF object (`header_vcf`).
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # FIX: open/close pair replaced by a context manager so the
                            # handle is released even if vcf.Writer raises
                            header_file_tmp = os.path.join(tmpdir, "header")
                            with open(header_file_tmp, "w") as f:
                                vcf.Writer(f, db_header_from_columns)

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # FIX: narrowed bare `except:` (which would also swallow
                    # KeyboardInterrupt/SystemExit); still best-effort with a warning
                    except Exception:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        The `get_query_to_df` function takes a query as a string and returns the result as a
        pandas DataFrame based on the connection format.

        :param query: the SQL query to execute
        :type query: str
        :param limit: maximum number of rows to fetch; if None, the full result is returned
        :type limit: int
        :return: A pandas DataFrame with the query result.
        :raises ValueError: if the connexion format is neither "duckdb" nor "sqlite"
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # FIX: unknown formats previously fell through and raised an opaque
        # UnboundLocalError on `df`; fail explicitly instead
        if connexion_format not in ["duckdb", "sqlite"]:
            raise ValueError(f"Connexion format '{connexion_format}' not available")

        # Limit in query
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        The function logs the input, output, config, param, sample list and dataframe of the
        current object.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector: the export dataframe can be large
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the current
        object, including information about the input file, variants, samples, header fields,
        quality, and SNVs/InDels.
        :return: a dictionary with sections "Infos", "Variants", "Samples", "Header",
            "Quality" (when a QUAL column exists).
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when genotypes are present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                    REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                    count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                    concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: special VCF "Number" codes (., A, G, R)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        The function `stats_to_file` takes a file name as input, retrieves statistics,
        serializes them into a JSON object, and writes the JSON object to the specified file.

        :param file: the file path where the JSON data will be written
        :type file: str
        :return: the name of the file that was written to.
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a markdown file and prints the statistics
        contained in a JSON file in a formatted manner.

        :param output_file: path of the Markdown output file; defaults to "stats.md" in a
            temporary directory
        :type output_file: str
        :param json_file: path of the JSON stats file; defaults to "stats.json" in a
            temporary directory
        :type json_file: str
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # best-effort: render as a table when the value is dict-like,
                        # otherwise as a plain bullet line
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except Exception:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except Exception:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            # FIX: local handle renamed from `fp`, which shadowed the
            # `fastparquet as fp` module alias
            with open(output_file, "w") as out_f:
                for item in output_title:
                    out_f.write("%s\n" % item)
                for item in output_index:
                    out_f.write("%s\n" % item)
                for item in output:
                    out_f.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        It returns the value of the input variable.
        :return: The input is being returned.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        This function returns the format of the input file (or of the object's input when no
        file is provided).

        :param input_file: file path to inspect; defaults to the object's input
        :type input_file: str
        :return: The format of the input file.
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        The function `get_input_compressed` returns whether the input file is compressed.

        :param input_file: file path to inspect; defaults to the object's input
        :type input_file: str
        :return: The compressed status of the input file.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        It returns the output file.
        :return: The output attribute of the object.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        The function `get_output_format` returns the format of the output file.

        :param output_file: file path to inspect; defaults to the object's output
        :type output_file: str
        :return: The format of the output file.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config.
        :return: The config variable is being returned.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param.
        :return: The param variable is being returned.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object.
        :return: The connexion_db is being returned.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix is being returned.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        This function returns the table_variants attribute of the object, formatted for the
        requested SQL clause.

        :param clause: the type of clause the table will be used in: "select", "where",
            "update" or "from", defaults to "select" (optional)
        :return: The table_variants attribute of the object (possibly aliased).
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only parquet, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on configuration
        parameters or a default path ("/tmp").
        :return: The temporary directory path.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        It returns the connexion type from the configuration ("memory" by default).

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        It returns the connection object.

        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: None (the connection is closed).
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file, either as a VCF object or as a list
        of strings. When no header is set, a minimal required VCF header is returned.

        :param type: the representation wanted: "vcf" (object) or "list", defaults to "vcf"
            (optional)
        :return: The header of the VCF file.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        This function retrieves the list of INFO field identifiers from the header.
        :return: A list of INFO field names from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the
        #CHROM line.

        :param file: optional path to a VCF header file; when provided, the header is read
            from that file instead of the object's header
        :type file: str
        :return: the length of the header list, excluding the #CHROM line (0 if no header).
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the #CHROM columns line of the VCF header.

        :return: The last line of the header list (the "#CHROM ..." line), or "".
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        This function returns the #CHROM columns line of the VCF header as a list.

        :return: The header columns split on tabs, or [].
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as a comma-separated list of quoted SQL
        identifiers.

        :return: The header columns as a SQL column list string.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)
If a sample is not found 1177 in the 1178 :type samples: list 1179 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1180 a boolean parameter that determines whether to force the function to return the sample list 1181 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1182 function will return the sample list without performing, defaults to False 1183 :type samples_force: bool (optional) 1184 :return: The function `get_header_sample_list` returns a list of samples based on the input 1185 parameters and conditions specified in the function. 1186 """ 1187 1188 # Init 1189 samples_list = [] 1190 1191 if samples is None: 1192 samples_list = self.header_vcf.samples 1193 else: 1194 samples_checked = [] 1195 for sample in samples: 1196 if sample in self.header_vcf.samples: 1197 samples_checked.append(sample) 1198 else: 1199 log.warning(f"Sample '{sample}' not defined in header") 1200 samples_list = samples_checked 1201 1202 # Force sample list without checking if is_genotype_column 1203 if samples_force: 1204 log.warning(f"Samples {samples_list} not checked if genotypes") 1205 return samples_list 1206 1207 if check: 1208 samples_checked = [] 1209 for sample in samples_list: 1210 if self.is_genotype_column(column=sample): 1211 samples_checked.append(sample) 1212 else: 1213 log.warning( 1214 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1215 ) 1216 samples_list = samples_checked 1217 1218 # Return samples list 1219 return samples_list 1220 1221 def is_genotype_column(self, column: str = None) -> bool: 1222 """ 1223 This function checks if a given column is a genotype column in a database. 1224 1225 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1226 represents the column name in a database table. This method checks if the specified column is a 1227 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1228 method of 1229 :type column: str 1230 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1231 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1232 column name and returns the result. If the `column` parameter is None, it returns False. 1233 """ 1234 1235 if column is not None: 1236 return Database(database=self.get_input()).is_genotype_column(column=column) 1237 else: 1238 return False 1239 1240 def get_verbose(self) -> bool: 1241 """ 1242 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1243 exist 1244 1245 :return: The value of the key "verbose" in the config dictionary. 1246 """ 1247 return self.get_config().get("verbose", False) 1248 1249 def get_connexion_format(self) -> str: 1250 """ 1251 It returns the connexion format of the object. 1252 :return: The connexion_format is being returned. 1253 """ 1254 connexion_format = self.connexion_format 1255 if connexion_format not in ["duckdb", "sqlite"]: 1256 log.error(f"Unknown connexion format {connexion_format}") 1257 raise ValueError(f"Unknown connexion format {connexion_format}") 1258 else: 1259 return connexion_format 1260 1261 def insert_file_to_table( 1262 self, 1263 file, 1264 columns: str, 1265 header_len: int = 0, 1266 sep: str = "\t", 1267 chunksize: int = 1000000, 1268 ) -> None: 1269 """ 1270 The function reads a file in chunks and inserts each chunk into a table based on the specified 1271 database format. 1272 1273 :param file: The `file` parameter is the file that you want to load into a table. It should be 1274 the path to the file on your system 1275 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1276 should contain the names of the columns in the table where the data will be inserted. 
The column 1277 names should be separated by commas within the string. For example, if you have columns named 1278 "id", "name 1279 :type columns: str 1280 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1281 the number of lines to skip at the beginning of the file before reading the actual data. This 1282 parameter allows you to skip any header information present in the file before processing the 1283 data, defaults to 0 1284 :type header_len: int (optional) 1285 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1286 separator character that is used in the file being read. In this case, the default separator is 1287 set to `\t`, which represents a tab character. You can change this parameter to a different 1288 separator character if, defaults to \t 1289 :type sep: str (optional) 1290 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1291 when processing the file in chunks. In the provided code snippet, the default value for 1292 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1293 to 1000000 1294 :type chunksize: int (optional) 1295 """ 1296 1297 # Config 1298 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1299 connexion_format = self.get_connexion_format() 1300 1301 log.debug("chunksize: " + str(chunksize)) 1302 1303 if chunksize: 1304 for chunk in pd.read_csv( 1305 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1306 ): 1307 if connexion_format in ["duckdb"]: 1308 sql_insert_into = ( 1309 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1310 ) 1311 self.conn.execute(sql_insert_into) 1312 elif connexion_format in ["sqlite"]: 1313 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1314 1315 def load_data( 1316 self, 1317 input_file: str = None, 1318 drop_variants_table: bool = False, 1319 sample_size: int = 20480, 1320 ) -> None: 1321 """ 1322 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1323 table before loading the data and specify a sample size. 1324 1325 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1326 table 1327 :type input_file: str 1328 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1329 determines whether the variants table should be dropped before loading the data. If set to 1330 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1331 not be dropped, defaults to False 1332 :type drop_variants_table: bool (optional) 1333 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1334 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1335 20480 1336 :type sample_size: int (optional) 1337 """ 1338 1339 log.info("Loading...") 1340 1341 # change input file 1342 if input_file: 1343 self.set_input(input_file) 1344 self.set_header() 1345 1346 # drop variants table 1347 if drop_variants_table: 1348 self.drop_variants_table() 1349 1350 # get table variants 1351 table_variants = self.get_table_variants() 1352 1353 # Access 1354 access = self.get_config().get("access", None) 1355 log.debug(f"access: {access}") 1356 1357 # Input format and compress 1358 input_format = self.get_input_format() 1359 input_compressed = self.get_input_compressed() 1360 log.debug(f"input_format: {input_format}") 1361 log.debug(f"input_compressed: {input_compressed}") 1362 1363 # input_compressed_format 1364 if input_compressed: 1365 input_compressed_format = "gzip" 1366 else: 1367 input_compressed_format = "none" 1368 log.debug(f"input_compressed_format: {input_compressed_format}") 1369 1370 # Connexion format 1371 connexion_format = self.get_connexion_format() 1372 1373 # Sample size 1374 if not sample_size: 1375 sample_size = -1 1376 log.debug(f"sample_size: {sample_size}") 1377 1378 # Load data 1379 log.debug(f"Load Data from {input_format}") 1380 1381 # DuckDB connexion 1382 if connexion_format in ["duckdb"]: 1383 1384 # Database already exists 1385 if self.input_format in ["db", "duckdb"]: 1386 1387 if connexion_format in ["duckdb"]: 1388 log.debug(f"Input file format '{self.input_format}' duckDB") 1389 else: 1390 log.error( 1391 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1392 ) 1393 raise ValueError( 1394 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1395 ) 1396 1397 # Load from existing database format 1398 else: 1399 1400 try: 1401 # Create Table or View 1402 database = Database(database=self.input) 1403 sql_from = 
database.get_sql_from(sample_size=sample_size) 1404 1405 if access in ["RO"]: 1406 sql_load = ( 1407 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1408 ) 1409 else: 1410 sql_load = ( 1411 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1412 ) 1413 self.conn.execute(sql_load) 1414 1415 except: 1416 # Format not available 1417 log.error(f"Input file format '{self.input_format}' not available") 1418 raise ValueError( 1419 f"Input file format '{self.input_format}' not available" 1420 ) 1421 1422 # SQLite connexion 1423 elif connexion_format in ["sqlite"] and input_format in [ 1424 "vcf", 1425 "tsv", 1426 "csv", 1427 "psv", 1428 ]: 1429 1430 # Main structure 1431 structure = { 1432 "#CHROM": "VARCHAR", 1433 "POS": "INTEGER", 1434 "ID": "VARCHAR", 1435 "REF": "VARCHAR", 1436 "ALT": "VARCHAR", 1437 "QUAL": "VARCHAR", 1438 "FILTER": "VARCHAR", 1439 "INFO": "VARCHAR", 1440 } 1441 1442 # Strcuture with samples 1443 structure_complete = structure 1444 if self.get_header_sample_list(): 1445 structure["FORMAT"] = "VARCHAR" 1446 for sample in self.get_header_sample_list(): 1447 structure_complete[sample] = "VARCHAR" 1448 1449 # Columns list for create and insert 1450 sql_create_table_columns = [] 1451 sql_create_table_columns_list = [] 1452 for column in structure_complete: 1453 column_type = structure_complete[column] 1454 sql_create_table_columns.append( 1455 f'"{column}" {column_type} default NULL' 1456 ) 1457 sql_create_table_columns_list.append(f'"{column}"') 1458 1459 # Create database 1460 log.debug(f"Create Table {table_variants}") 1461 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1462 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1463 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1464 self.conn.execute(sql_create_table) 1465 1466 # chunksize define length of file chunk load file 1467 chunksize = 100000 1468 1469 # delimiter 1470 delimiter 
= file_format_delimiters.get(input_format, "\t") 1471 1472 # Load the input file 1473 with open(self.input, "rt") as input_file: 1474 1475 # Use the appropriate file handler based on the input format 1476 if input_compressed: 1477 input_file = bgzf.open(self.input, "rt") 1478 if input_format in ["vcf"]: 1479 header_len = self.get_header_length() 1480 else: 1481 header_len = 0 1482 1483 # Insert the file contents into a table 1484 self.insert_file_to_table( 1485 input_file, 1486 columns=sql_create_table_columns_list_sql, 1487 header_len=header_len, 1488 sep=delimiter, 1489 chunksize=chunksize, 1490 ) 1491 1492 else: 1493 log.error( 1494 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1495 ) 1496 raise ValueError( 1497 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1498 ) 1499 1500 # Explode INFOS fields into table fields 1501 if self.get_explode_infos(): 1502 self.explode_infos( 1503 prefix=self.get_explode_infos_prefix(), 1504 fields=self.get_explode_infos_fields(), 1505 force=True, 1506 ) 1507 1508 # Create index after insertion 1509 self.create_indexes() 1510 1511 def get_explode_infos(self) -> bool: 1512 """ 1513 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1514 to False if it is not set. 1515 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1516 value. If the parameter is not present, it will return False. 1517 """ 1518 1519 return self.get_param().get("explode", {}).get("explode_infos", False) 1520 1521 def get_explode_infos_fields( 1522 self, 1523 explode_infos_fields: str = None, 1524 remove_fields_not_in_header: bool = False, 1525 ) -> list: 1526 """ 1527 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1528 the input parameter `explode_infos_fields`. 
1529 1530 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1531 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1532 comma-separated list of field names to explode 1533 :type explode_infos_fields: str 1534 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1535 flag that determines whether to remove fields that are not present in the header. If it is set 1536 to `True`, any field that is not in the header will be excluded from the list of exploded 1537 information fields. If it is set to `, defaults to False 1538 :type remove_fields_not_in_header: bool (optional) 1539 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1540 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1541 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1542 Otherwise, it returns a list of exploded information fields after removing any spaces and 1543 splitting the string by commas. 
1544 """ 1545 1546 # If no fields, get it in param 1547 if not explode_infos_fields: 1548 explode_infos_fields = ( 1549 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1550 ) 1551 1552 # If no fields, defined as all fields in header using keyword 1553 if not explode_infos_fields: 1554 explode_infos_fields = "*" 1555 1556 # If fields list not empty 1557 if explode_infos_fields: 1558 1559 # Input fields list 1560 if isinstance(explode_infos_fields, str): 1561 fields_input = explode_infos_fields.split(",") 1562 elif isinstance(explode_infos_fields, list): 1563 fields_input = explode_infos_fields 1564 else: 1565 fields_input = [] 1566 1567 # Fields list without * keyword 1568 fields_without_all = fields_input.copy() 1569 if "*".casefold() in (item.casefold() for item in fields_without_all): 1570 fields_without_all.remove("*") 1571 1572 # Fields in header 1573 fields_in_header = sorted(list(set(self.get_header().infos))) 1574 1575 # Construct list of fields 1576 fields_output = [] 1577 for field in fields_input: 1578 1579 # Strip field 1580 field = field.strip() 1581 1582 # format keyword * in regex 1583 if field.upper() in ["*"]: 1584 field = ".*" 1585 1586 # Find all fields with pattern 1587 r = re.compile(field) 1588 fields_search = sorted(list(filter(r.match, fields_in_header))) 1589 1590 # Remove fields input from search 1591 if field in fields_search: 1592 fields_search = [field] 1593 elif fields_search != [field]: 1594 fields_search = sorted( 1595 list(set(fields_search).difference(fields_input)) 1596 ) 1597 1598 # If field is not in header (avoid not well formatted header) 1599 if not fields_search and not remove_fields_not_in_header: 1600 fields_search = [field] 1601 1602 # Add found fields 1603 for new_field in fields_search: 1604 # Add field, if not already exists, and if it is in header (if asked) 1605 if ( 1606 new_field not in fields_output 1607 and ( 1608 not remove_fields_not_in_header 1609 or new_field in fields_in_header 1610 ) 
1611 and new_field not in [".*"] 1612 ): 1613 fields_output.append(new_field) 1614 1615 return fields_output 1616 1617 else: 1618 1619 return [] 1620 1621 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1622 """ 1623 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1624 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1625 not provided. 1626 1627 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1628 prefix to be used for exploding or expanding information 1629 :type explode_infos_prefix: str 1630 :return: the value of the variable `explode_infos_prefix`. 1631 """ 1632 1633 if not explode_infos_prefix: 1634 explode_infos_prefix = ( 1635 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1636 ) 1637 1638 return explode_infos_prefix 1639 1640 def add_column( 1641 self, 1642 table_name, 1643 column_name, 1644 column_type, 1645 default_value=None, 1646 drop: bool = False, 1647 ) -> dict: 1648 """ 1649 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1650 doesn't already exist. 1651 1652 :param table_name: The name of the table to which you want to add a column 1653 :param column_name: The parameter "column_name" is the name of the column that you want to add 1654 to the table 1655 :param column_type: The `column_type` parameter specifies the data type of the column that you 1656 want to add to the table. It should be a string that represents the desired data type, such as 1657 "INTEGER", "TEXT", "REAL", etc 1658 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1659 default value for the newly added column. 
If a default value is provided, it will be assigned to 1660 the column for any existing rows that do not have a value for that column 1661 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1662 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1663 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1664 to False 1665 :type drop: bool (optional) 1666 :return: a boolean value indicating whether the column was successfully added to the table. 1667 """ 1668 1669 # added 1670 added = False 1671 dropped = False 1672 1673 # Check if the column already exists in the table 1674 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1675 columns = self.get_query_to_df(query).columns.tolist() 1676 if column_name.upper() in [c.upper() for c in columns]: 1677 log.debug( 1678 f"The {column_name} column already exists in the {table_name} table" 1679 ) 1680 if drop: 1681 self.drop_column(table_name=table_name, column_name=column_name) 1682 dropped = True 1683 else: 1684 return None 1685 else: 1686 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1687 1688 # Add column in table 1689 add_column_query = ( 1690 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1691 ) 1692 if default_value is not None: 1693 add_column_query += f" DEFAULT {default_value}" 1694 self.execute_query(add_column_query) 1695 added = not dropped 1696 log.debug( 1697 f"The {column_name} column was successfully added to the {table_name} table" 1698 ) 1699 1700 if added: 1701 added_column = { 1702 "table_name": table_name, 1703 "column_name": column_name, 1704 "column_type": column_type, 1705 "default_value": default_value, 1706 } 1707 else: 1708 added_column = None 1709 1710 return added_column 1711 1712 def drop_column( 1713 self, column: dict = None, table_name: str = None, column_name: str = None 1714 ) -> bool: 1715 """ 1716 The 
`drop_column` function drops a specified column from a given table in a database and returns 1717 True if the column was successfully dropped, and False if the column does not exist in the 1718 table. 1719 1720 :param column: The `column` parameter is a dictionary that contains information about the column 1721 you want to drop. It has two keys: 1722 :type column: dict 1723 :param table_name: The `table_name` parameter is the name of the table from which you want to 1724 drop a column 1725 :type table_name: str 1726 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1727 from the table 1728 :type column_name: str 1729 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1730 and False if the column does not exist in the table. 1731 """ 1732 1733 # Find column infos 1734 if column: 1735 if isinstance(column, dict): 1736 table_name = column.get("table_name", None) 1737 column_name = column.get("column_name", None) 1738 elif isinstance(column, str): 1739 table_name = self.get_table_variants() 1740 column_name = column 1741 else: 1742 table_name = None 1743 column_name = None 1744 1745 if not table_name and not column_name: 1746 return False 1747 1748 # Removed 1749 removed = False 1750 1751 # Check if the column already exists in the table 1752 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1753 columns = self.get_query_to_df(query).columns.tolist() 1754 if column_name in columns: 1755 log.debug(f"The {column_name} column exists in the {table_name} table") 1756 else: 1757 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1758 return False 1759 1760 # Add column in table # ALTER TABLE integers DROP k 1761 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1762 self.execute_query(add_column_query) 1763 removed = True 1764 log.debug( 1765 f"The {column_name} column was successfully dropped to the {table_name} table" 1766 ) 1767 
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each requested field, a column named ``<prefix><field>`` is added
        to the variants table and filled by parsing the INFO string with
        engine-specific SQL (REGEXP_EXTRACT for duckdb, instr/substr for
        sqlite). Updates are applied per chromosome to bound statement size.
        A no-op in read-only ("RO") access mode.

        :param prefix: prefix for the exploded columns; falls back to
            ``get_explode_infos_prefix()``, then to "INFO/"
        :param create_index: when True, recreate indexes afterwards;
            defaults to False
        :param fields: INFO fields (or patterns) to explode; resolved through
            ``get_explode_infos_fields``
        :param force: when True, drop and recreate columns that already
            exist; defaults to False
        :param proccess_all_fields_together: when True, run one UPDATE for
            all fields instead of one per field, defaults to False
            (NOTE(review): parameter name typo kept for interface
            compatibility)
        :param table: optional table to alter instead of the variants table
        :return: the list of added-column descriptors (as returned by
            ``add_column``)
        """

        # Drop indexes first: the mass UPDATEs below would otherwise pay
        # index-maintenance costs
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: anything that is not a usable string falls back
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort; NOTE(review): broad except keeps the
            # original silent fallback)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Known fields: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields that are known (header, prefixed
                # header, or extra infos)
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; unknown fields default to
                    # a single String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the engine-specific SET expression that pulls
                        # ';<info>=<value>' out of the INFO string ('' and
                        # '.' are normalized to NULL on duckdb)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Split the updates by chromosome; any failure to list
                # chromosomes falls back to a single unfiltered pass
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful with several chromosomes)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
1977 ) 1978 # log.debug(sql_info_alter_table) 1979 self.conn.execute(sql_info_alter_table) 1980 1981 # create indexes 1982 if create_index: 1983 self.create_indexes() 1984 1985 return added_columns 1986 1987 def create_indexes(self) -> None: 1988 """ 1989 Create indexes on the table after insertion 1990 """ 1991 1992 # Access 1993 access = self.get_config().get("access", None) 1994 1995 # get table variants 1996 table_variants = self.get_table_variants("FROM") 1997 1998 if self.get_indexing() and access not in ["RO"]: 1999 # Create index 2000 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2001 self.conn.execute(sql_create_table_index) 2002 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2003 self.conn.execute(sql_create_table_index) 2004 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2005 self.conn.execute(sql_create_table_index) 2006 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2007 self.conn.execute(sql_create_table_index) 2008 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2009 self.conn.execute(sql_create_table_index) 2010 for field in self.index_additionnal_fields: 2011 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2012 self.conn.execute(sql_create_table_index) 2013 2014 def drop_indexes(self) -> None: 2015 """ 2016 Create indexes on the table after insertion 2017 """ 2018 2019 # Access 2020 access = self.get_config().get("access", None) 2021 2022 # get table variants 2023 table_variants = self.get_table_variants("FROM") 2024 2025 # Get database format 2026 connexion_format = 
self.get_connexion_format() 2027 2028 if access not in ["RO"]: 2029 if connexion_format in ["duckdb"]: 2030 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2031 elif connexion_format in ["sqlite"]: 2032 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2033 2034 list_indexes = self.conn.execute(sql_list_indexes) 2035 index_names = [row[0] for row in list_indexes.fetchall()] 2036 for index in index_names: 2037 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2038 self.conn.execute(sql_drop_table_index) 2039 2040 def read_vcf_header(self, f) -> list: 2041 """ 2042 It reads the header of a VCF file and returns a list of the header lines 2043 2044 :param f: the file object 2045 :return: The header lines of the VCF file. 2046 """ 2047 2048 header_list = [] 2049 for line in f: 2050 header_list.append(line) 2051 if line.startswith("#CHROM"): 2052 break 2053 return header_list 2054 2055 def read_vcf_header_file(self, file: str = None) -> list: 2056 """ 2057 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2058 uncompressed files. 2059 2060 :param file: The `file` parameter is a string that represents the path to the VCF header file 2061 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2062 default to `None` 2063 :type file: str 2064 :return: The function `read_vcf_header_file` returns a list. 2065 """ 2066 2067 if self.get_input_compressed(input_file=file): 2068 with bgzf.open(file, "rt") as f: 2069 return self.read_vcf_header(f=f) 2070 else: 2071 with open(file, "rt") as f: 2072 return self.read_vcf_header(f=f) 2073 2074 def execute_query(self, query: str): 2075 """ 2076 It takes a query as an argument, executes it, and returns the results 2077 2078 :param query: The query to be executed 2079 :return: The result of the query is being returned. 
2080 """ 2081 if query: 2082 return self.conn.execute(query) # .fetchall() 2083 else: 2084 return None 2085 2086 def export_output( 2087 self, 2088 output_file: str | None = None, 2089 output_header: str | None = None, 2090 export_header: bool = True, 2091 query: str | None = None, 2092 parquet_partitions: list | None = None, 2093 chunk_size: int | None = None, 2094 threads: int | None = None, 2095 sort: bool = False, 2096 index: bool = False, 2097 order_by: str | None = None, 2098 fields_to_rename: dict | None = None 2099 ) -> bool: 2100 """ 2101 The `export_output` function exports data from a VCF file to various formats, including VCF, 2102 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2103 partitioning. 2104 2105 :param output_file: The `output_file` parameter is a string that specifies the name of the 2106 output file where the exported data will be saved 2107 :type output_file: str | None 2108 :param output_header: The `output_header` parameter is a string that specifies the name of the 2109 file where the header of the VCF file will be exported. If this parameter is not provided, the 2110 header will be exported to a file with the same name as the `output_file` parameter, but with 2111 the extension " 2112 :type output_header: str | None 2113 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2114 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2115 True, the header will be exported to a file. If `export_header` is False, the header will not 2116 be, defaults to True 2117 :type export_header: bool (optional) 2118 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2119 that can be used to filter and select specific data from the VCF file before exporting it. If 2120 provided, only the data that matches the query will be exported. 
This allows you to customize 2121 the exported data based on 2122 :type query: str | None 2123 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2124 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2125 organize data in a hierarchical directory structure based on the values of one or more columns. 2126 This can improve query performance when working with large datasets 2127 :type parquet_partitions: list | None 2128 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2129 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2130 multiple files. It helps in optimizing the export process by breaking down the data into 2131 manageable chunks for processing and storage 2132 :type chunk_size: int | None 2133 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2134 threads to be used during the export process. It determines the level of parallelism and can 2135 improve the performance of the export operation. If this parameter is not provided, the function 2136 will use the default number of threads 2137 :type threads: int | None 2138 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2139 determines whether the output file should be sorted based on genomic coordinates of the 2140 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2141 `False`,, defaults to False 2142 :type sort: bool (optional) 2143 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2144 determines whether an index should be created on the output file. If `index` is set to `True`, 2145 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2146 :type index: bool (optional) 2147 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2148 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2149 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2150 output file should be 2151 :type order_by: str | None 2152 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2153 mapping of field names to be renamed during the export process. This parameter allows you to 2154 customize the output field names before exporting the data. Each key-value pair in the 2155 dictionary represents the original field name as the key and the new field name 2156 :type fields_to_rename: dict | None 2157 :return: The `export_output` function returns a boolean value. It checks if the output file 2158 exists and returns True if it does, or None if it doesn't. 
2159 """ 2160 2161 # Log 2162 log.info("Exporting...") 2163 2164 # Full path 2165 output_file = full_path(output_file) 2166 output_header = full_path(output_header) 2167 2168 # Config 2169 config = self.get_config() 2170 2171 # Param 2172 param = self.get_param() 2173 2174 # Tmp files to remove 2175 tmp_to_remove = [] 2176 2177 # If no output, get it 2178 if not output_file: 2179 output_file = self.get_output() 2180 2181 # If not threads 2182 if not threads: 2183 threads = self.get_threads() 2184 2185 # Rename fields 2186 if not fields_to_rename: 2187 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2188 self.rename_info_fields(fields_to_rename=fields_to_rename) 2189 2190 # Auto header name with extension 2191 if export_header or output_header: 2192 if not output_header: 2193 output_header = f"{output_file}.hdr" 2194 # Export header 2195 self.export_header(output_file=output_file) 2196 2197 # Switch off export header if VCF output 2198 output_file_type = get_file_format(output_file) 2199 if output_file_type in ["vcf"]: 2200 export_header = False 2201 tmp_to_remove.append(output_header) 2202 2203 # Chunk size 2204 if not chunk_size: 2205 chunk_size = config.get("chunk_size", None) 2206 2207 # Parquet partition 2208 if not parquet_partitions: 2209 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2210 if parquet_partitions and isinstance(parquet_partitions, str): 2211 parquet_partitions = parquet_partitions.split(",") 2212 2213 # Order by 2214 if not order_by: 2215 order_by = param.get("export", {}).get("order_by", "") 2216 2217 # Header in output 2218 header_in_output = param.get("export", {}).get("include_header", False) 2219 2220 # Database 2221 database_source = self.get_connexion() 2222 2223 # Connexion format 2224 connexion_format = self.get_connexion_format() 2225 2226 # Explode infos 2227 if self.get_explode_infos(): 2228 self.explode_infos( 2229 prefix=self.get_explode_infos_prefix(), 2230 
fields=self.get_explode_infos_fields(), 2231 force=False, 2232 ) 2233 2234 # if connexion_format in ["sqlite"] or query: 2235 if connexion_format in ["sqlite"]: 2236 2237 # Export in Parquet 2238 random_tmp = "".join( 2239 random.choice(string.ascii_lowercase) for i in range(10) 2240 ) 2241 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2242 tmp_to_remove.append(database_source) 2243 2244 # Table Variants 2245 table_variants = self.get_table_variants() 2246 2247 # Create export query 2248 sql_query_export_subquery = f""" 2249 SELECT * FROM {table_variants} 2250 """ 2251 2252 # Write source file 2253 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2254 2255 # Create database 2256 database = Database( 2257 database=database_source, 2258 table="variants", 2259 header_file=output_header, 2260 conn_config=self.get_connexion_config(), 2261 ) 2262 2263 # Existing colomns header 2264 existing_columns_header = database.get_header_columns_from_database(query=query) 2265 2266 # Sample list 2267 if output_file_type in ["vcf"]: 2268 get_samples = self.get_samples() 2269 get_samples_check = self.get_samples_check() 2270 samples_force = get_samples is not None 2271 sample_list = self.get_header_sample_list( 2272 check=get_samples_check, 2273 samples=get_samples, 2274 samples_force=samples_force, 2275 ) 2276 else: 2277 sample_list = None 2278 2279 # Export file 2280 database.export( 2281 output_database=output_file, 2282 output_header=output_header, 2283 existing_columns_header=existing_columns_header, 2284 parquet_partitions=parquet_partitions, 2285 chunk_size=chunk_size, 2286 threads=threads, 2287 sort=sort, 2288 index=index, 2289 header_in_output=header_in_output, 2290 order_by=order_by, 2291 query=query, 2292 export_header=export_header, 2293 sample_list=sample_list, 2294 ) 2295 2296 # Remove 2297 remove_if_exists(tmp_to_remove) 2298 2299 return (os.path.exists(output_file) or None) and ( 2300 os.path.exists(output_file) 
or None 2301 ) 2302 2303 def get_extra_infos(self, table: str = None) -> list: 2304 """ 2305 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2306 in the header. 2307 2308 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2309 name of the table from which you want to retrieve the extra columns that are not present in the 2310 header. If the `table` parameter is not provided when calling the function, it will default to 2311 using the variants 2312 :type table: str 2313 :return: A list of columns that are in the specified table but not in the header of the table. 2314 """ 2315 2316 header_columns = [] 2317 2318 if not table: 2319 table = self.get_table_variants(clause="from") 2320 header_columns = self.get_header_columns() 2321 2322 # Check all columns in the database 2323 query = f""" SELECT * FROM {table} LIMIT 1 """ 2324 log.debug(f"query {query}") 2325 table_columns = self.get_query_to_df(query).columns.tolist() 2326 extra_columns = [] 2327 2328 # Construct extra infos (not in header) 2329 for column in table_columns: 2330 if column not in header_columns: 2331 extra_columns.append(column) 2332 2333 return extra_columns 2334 2335 def get_extra_infos_sql(self, table: str = None) -> str: 2336 """ 2337 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2338 by double quotes 2339 2340 :param table: The name of the table to get the extra infos from. 
If None, the default table is 2341 used 2342 :type table: str 2343 :return: A string of the extra infos 2344 """ 2345 2346 return ", ".join( 2347 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2348 ) 2349 2350 def export_header( 2351 self, 2352 header_name: str = None, 2353 output_file: str = None, 2354 output_file_ext: str = ".hdr", 2355 clean_header: bool = True, 2356 remove_chrom_line: bool = False, 2357 ) -> str: 2358 """ 2359 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2360 specified options, and writes it to a new file. 2361 2362 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2363 this parameter is not specified, the header will be written to the output file 2364 :type header_name: str 2365 :param output_file: The `output_file` parameter in the `export_header` function is used to 2366 specify the name of the output file where the header will be written. If this parameter is not 2367 provided, the header will be written to a temporary file 2368 :type output_file: str 2369 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2370 string that represents the extension of the output header file. By default, it is set to ".hdr" 2371 if not specified by the user. This extension will be appended to the `output_file` name to 2372 create the final, defaults to .hdr 2373 :type output_file_ext: str (optional) 2374 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2375 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2376 `True`, the function will clean the header by modifying certain lines based on a specific 2377 pattern. 
If `clean_header`, defaults to True 2378 :type clean_header: bool (optional) 2379 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2380 boolean flag that determines whether the #CHROM line should be removed from the header before 2381 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2382 defaults to False 2383 :type remove_chrom_line: bool (optional) 2384 :return: The function `export_header` returns the name of the temporary header file that is 2385 created. 2386 """ 2387 2388 if not header_name and not output_file: 2389 output_file = self.get_output() 2390 2391 if self.get_header(): 2392 2393 # Get header object 2394 header_obj = self.get_header() 2395 2396 # Create database 2397 db_for_header = Database(database=self.get_input()) 2398 2399 # Get real columns in the file 2400 db_header_columns = db_for_header.get_columns() 2401 2402 with tempfile.TemporaryDirectory() as tmpdir: 2403 2404 # Write header file 2405 header_file_tmp = os.path.join(tmpdir, "header") 2406 f = open(header_file_tmp, "w") 2407 vcf.Writer(f, header_obj) 2408 f.close() 2409 2410 # Replace #CHROM line with rel columns 2411 header_list = db_for_header.read_header_file( 2412 header_file=header_file_tmp 2413 ) 2414 header_list[-1] = "\t".join(db_header_columns) 2415 2416 # Remove CHROM line 2417 if remove_chrom_line: 2418 header_list.pop() 2419 2420 # Clean header 2421 if clean_header: 2422 header_list_clean = [] 2423 for head in header_list: 2424 # Clean head for malformed header 2425 head_clean = head 2426 head_clean = re.subn( 2427 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2428 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2429 head_clean, 2430 2, 2431 )[0] 2432 # Write header 2433 header_list_clean.append(head_clean) 2434 header_list = header_list_clean 2435 2436 tmp_header_name = output_file + output_file_ext 2437 2438 f = open(tmp_header_name, "w") 2439 for line in header_list: 2440 f.write(line) 
2441 f.close() 2442 2443 return tmp_header_name 2444 2445 def export_variant_vcf( 2446 self, 2447 vcf_file, 2448 remove_info: bool = False, 2449 add_samples: bool = True, 2450 list_samples: list = [], 2451 where_clause: str = "", 2452 index: bool = False, 2453 threads: int | None = None, 2454 ) -> bool | None: 2455 """ 2456 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2457 remove INFO field, add samples, and control compression and indexing. 2458 2459 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2460 written to. It is the output file that will contain the filtered VCF data based on the specified 2461 parameters 2462 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2463 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2464 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2465 in, defaults to False 2466 :type remove_info: bool (optional) 2467 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2468 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2469 If set to False, the samples will be removed. The default value is True, defaults to True 2470 :type add_samples: bool (optional) 2471 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2472 in the output VCF file. By default, all samples will be included. If you provide a list of 2473 samples, only those samples will be included in the output file 2474 :type list_samples: list 2475 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2476 determines whether or not to create an index for the output VCF file. If `index` is set to 2477 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2478 :type index: bool (optional) 2479 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2480 number of threads to use for exporting the VCF file. It determines how many parallel threads 2481 will be used during the export process. More threads can potentially speed up the export process 2482 by utilizing multiple cores of the processor. If 2483 :type threads: int | None 2484 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2485 method with various parameters including the output file, query, threads, sort flag, and index 2486 flag. The `export_output` method is responsible for exporting the VCF data based on the 2487 specified parameters and configurations provided in the `export_variant_vcf` function. 2488 """ 2489 2490 # Config 2491 config = self.get_config() 2492 2493 # Extract VCF 2494 log.debug("Export VCF...") 2495 2496 # Table variants 2497 table_variants = self.get_table_variants() 2498 2499 # Threads 2500 if not threads: 2501 threads = self.get_threads() 2502 2503 # Info fields 2504 if remove_info: 2505 if not isinstance(remove_info, str): 2506 remove_info = "." 
2507 info_field = f"""'{remove_info}' as INFO""" 2508 else: 2509 info_field = "INFO" 2510 2511 # Samples fields 2512 if add_samples: 2513 if not list_samples: 2514 list_samples = self.get_header_sample_list() 2515 if list_samples: 2516 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2517 else: 2518 samples_fields = "" 2519 log.debug(f"samples_fields: {samples_fields}") 2520 else: 2521 samples_fields = "" 2522 2523 # Where clause 2524 if where_clause is None: 2525 where_clause = "" 2526 2527 # Variants 2528 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2529 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2530 log.debug(f"sql_query_select={sql_query_select}") 2531 2532 return self.export_output( 2533 output_file=vcf_file, 2534 output_header=None, 2535 export_header=True, 2536 query=sql_query_select, 2537 parquet_partitions=None, 2538 chunk_size=config.get("chunk_size", None), 2539 threads=threads, 2540 sort=True, 2541 index=index, 2542 order_by=None, 2543 ) 2544 2545 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2546 """ 2547 It takes a list of commands and runs them in parallel using the number of threads specified 2548 2549 :param commands: A list of commands to run 2550 :param threads: The number of threads to use, defaults to 1 (optional) 2551 """ 2552 2553 run_parallel_commands(commands, threads) 2554 2555 def get_threads(self, default: int = 1) -> int: 2556 """ 2557 This function returns the number of threads to use for a job, with a default value of 1 if not 2558 specified. 2559 2560 :param default: The `default` parameter in the `get_threads` method is used to specify the 2561 default number of threads to use if no specific value is provided. 
If no value is provided for 2562 the `threads` parameter in the configuration or input parameters, the `default` value will be 2563 used, defaults to 1 2564 :type default: int (optional) 2565 :return: the number of threads to use for the current job. 2566 """ 2567 2568 # Config 2569 config = self.get_config() 2570 2571 # Param 2572 param = self.get_param() 2573 2574 # Input threads 2575 input_thread = param.get("threads", config.get("threads", None)) 2576 2577 # Check threads 2578 if not input_thread: 2579 threads = default 2580 elif int(input_thread) <= 0: 2581 threads = os.cpu_count() 2582 else: 2583 threads = int(input_thread) 2584 return threads 2585 2586 def get_memory(self, default: str = None) -> str: 2587 """ 2588 This function retrieves the memory value from parameters or configuration with a default value 2589 if not found. 2590 2591 :param default: The `get_memory` function takes in a default value as a string parameter. This 2592 default value is used as a fallback in case the `memory` parameter is not provided in the 2593 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2594 the function 2595 :type default: str 2596 :return: The `get_memory` function returns a string value representing the memory parameter. If 2597 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2598 return the default value provided as an argument to the function. 
    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table from a VCF file, dispatching to the
        backend-specific implementation (duckdb or sqlite).

        No-op for any other backend format.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        # Dispatch on the database backend format
        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file (duckdb backend).

        The VCF is loaded into a pandas DataFrame; the `vcf_df` name in
        the SQL below is resolved by duckdb directly against that local
        DataFrame. For every variant matching on #CHROM/POS/REF/ALT, the
        VCF INFO string is appended to the existing INFO, with a ';'
        separator when both sides are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping its header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: vcf_df is referenced by name inside the SQL query below
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                )
        ;
        """
        self.conn.execute(sql_query_update)
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file (sqlite backend).

        The VCF is loaded into a temporary table with the same structure
        as the variants table; for every variant matching on
        #CHROM/POS/REF/ALT, the VCF INFO string is appended to the
        existing INFO (with a ';' separator when both sides are
        non-empty), then the temporary table is dropped.

        :param vcf_file: the path to the VCF file to update the database with
        """

        # Create a temporary table with the structure of the variants table
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table (header lines skipped via comment='#')
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT is done with the || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO =  CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it
        with a hash of the assembly and the '#CHROM', 'POS', 'REF' and
        'ALT' columns.

        :param variant_id_column: name of the column to create in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: if True, the variant_id column is (re)created and
        populated even when it already exists
        :type force: bool
        :return: the name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE (columns added here are dropped at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column name fallback
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column if missing (or when forced)
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the literal string
            # '"{prefix}SVTYPE"', not the exploded SVTYPE column value —
            # confirm whether a column reference was intended
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
added_column in added_columns: 2809 self.drop_column(column=added_column) 2810 2811 # return variant_id column name 2812 return variant_id_column 2813 2814 def get_variant_id_column( 2815 self, variant_id_column: str = "variant_id", force: bool = None 2816 ) -> str: 2817 """ 2818 This function returns the variant_id column name 2819 2820 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2821 defaults to variant_id 2822 :type variant_id_column: str (optional) 2823 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2824 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2825 if it is not already set, or if it is set 2826 :type force: bool 2827 :return: The variant_id column name. 2828 """ 2829 2830 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2831 2832 ### 2833 # Annotation 2834 ### 2835 2836 def scan_databases( 2837 self, 2838 database_formats: list = ["parquet"], 2839 database_releases: list = ["current"], 2840 ) -> dict: 2841 """ 2842 The function `scan_databases` scans for available databases based on specified formats and 2843 releases. 2844 2845 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2846 of the databases to be scanned. In this case, the accepted format is "parquet" 2847 :type database_formats: list ["parquet"] 2848 :param database_releases: The `database_releases` parameter is a list that specifies the 2849 releases of the databases to be scanned. In the provided function, the default value for 2850 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2851 databases that are in the "current" 2852 :type database_releases: list 2853 :return: The function `scan_databases` returns a dictionary containing information about 2854 databases that match the specified formats and releases. 
2855 """ 2856 2857 # Config 2858 config = self.get_config() 2859 2860 # Param 2861 param = self.get_param() 2862 2863 # Param - Assembly 2864 assembly = param.get("assembly", config.get("assembly", None)) 2865 if not assembly: 2866 assembly = DEFAULT_ASSEMBLY 2867 log.warning(f"Default assembly '{assembly}'") 2868 2869 # Scan for availabled databases 2870 log.info( 2871 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2872 ) 2873 databases_infos_dict = databases_infos( 2874 database_folder_releases=database_releases, 2875 database_formats=database_formats, 2876 assembly=assembly, 2877 config=config, 2878 ) 2879 log.info( 2880 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2881 ) 2882 2883 return databases_infos_dict 2884 2885 def annotation(self) -> None: 2886 """ 2887 It annotates the VCF file with the annotations specified in the config file. 
2888 """ 2889 2890 # Config 2891 config = self.get_config() 2892 2893 # Param 2894 param = self.get_param() 2895 2896 # Param - Assembly 2897 assembly = param.get("assembly", config.get("assembly", None)) 2898 if not assembly: 2899 assembly = DEFAULT_ASSEMBLY 2900 log.warning(f"Default assembly '{assembly}'") 2901 2902 # annotations databases folders 2903 annotations_databases = set( 2904 config.get("folders", {}) 2905 .get("databases", {}) 2906 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2907 + config.get("folders", {}) 2908 .get("databases", {}) 2909 .get("parquet", ["~/howard/databases/parquet/current"]) 2910 + config.get("folders", {}) 2911 .get("databases", {}) 2912 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2913 ) 2914 2915 # Get param annotations 2916 if param.get("annotations", None) and isinstance( 2917 param.get("annotations", None), str 2918 ): 2919 log.debug(param.get("annotations", None)) 2920 param_annotation_list = param.get("annotations").split(",") 2921 else: 2922 param_annotation_list = [] 2923 2924 # Each tools param 2925 if param.get("annotation_parquet", None) != None: 2926 log.debug( 2927 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2928 ) 2929 if isinstance(param.get("annotation_parquet", None), list): 2930 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2931 else: 2932 param_annotation_list.append(param.get("annotation_parquet")) 2933 if param.get("annotation_snpsift", None) != None: 2934 if isinstance(param.get("annotation_snpsift", None), list): 2935 param_annotation_list.append( 2936 "snpsift:" 2937 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2938 ) 2939 else: 2940 param_annotation_list.append( 2941 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2942 ) 2943 if param.get("annotation_snpeff", None) != None: 2944 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2945 if param.get("annotation_bcftools", 
None) != None: 2946 if isinstance(param.get("annotation_bcftools", None), list): 2947 param_annotation_list.append( 2948 "bcftools:" 2949 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2950 ) 2951 else: 2952 param_annotation_list.append( 2953 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2954 ) 2955 if param.get("annotation_annovar", None) != None: 2956 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2957 if param.get("annotation_exomiser", None) != None: 2958 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2959 if param.get("annotation_splice", None) != None: 2960 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2961 2962 # Merge param annotations list 2963 param["annotations"] = ",".join(param_annotation_list) 2964 2965 # debug 2966 log.debug(f"param_annotations={param['annotations']}") 2967 2968 if param.get("annotations"): 2969 2970 # Log 2971 # log.info("Annotations - Check annotation parameters") 2972 2973 if not "annotation" in param: 2974 param["annotation"] = {} 2975 2976 # List of annotations parameters 2977 annotations_list_input = {} 2978 if isinstance(param.get("annotations", None), str): 2979 annotation_file_list = [ 2980 value for value in param.get("annotations", "").split(",") 2981 ] 2982 for annotation_file in annotation_file_list: 2983 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2984 else: 2985 annotations_list_input = param.get("annotations", {}) 2986 2987 log.info(f"Quick Annotations:") 2988 for annotation_key in list(annotations_list_input.keys()): 2989 log.info(f" {annotation_key}") 2990 2991 # List of annotations and associated fields 2992 annotations_list = {} 2993 2994 for annotation_file in annotations_list_input: 2995 2996 # Explode annotations if ALL 2997 if ( 2998 annotation_file.upper() == "ALL" 2999 or annotation_file.upper().startswith("ALL:") 3000 ): 3001 3002 # check ALL parameters (formats, 
releases) 3003 annotation_file_split = annotation_file.split(":") 3004 database_formats = "parquet" 3005 database_releases = "current" 3006 for annotation_file_option in annotation_file_split[1:]: 3007 database_all_options_split = annotation_file_option.split("=") 3008 if database_all_options_split[0] == "format": 3009 database_formats = database_all_options_split[1].split("+") 3010 if database_all_options_split[0] == "release": 3011 database_releases = database_all_options_split[1].split("+") 3012 3013 # Scan for availabled databases 3014 databases_infos_dict = self.scan_databases( 3015 database_formats=database_formats, 3016 database_releases=database_releases, 3017 ) 3018 3019 # Add found databases in annotation parameters 3020 for database_infos in databases_infos_dict.keys(): 3021 annotations_list[database_infos] = {"INFO": None} 3022 3023 else: 3024 annotations_list[annotation_file] = annotations_list_input[ 3025 annotation_file 3026 ] 3027 3028 # Check each databases 3029 if len(annotations_list): 3030 3031 log.info( 3032 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3033 ) 3034 3035 for annotation_file in annotations_list: 3036 3037 # Init 3038 annotations = annotations_list.get(annotation_file, None) 3039 3040 # Annotation snpEff 3041 if annotation_file.startswith("snpeff"): 3042 3043 log.debug(f"Quick Annotation snpEff") 3044 3045 if "snpeff" not in param["annotation"]: 3046 param["annotation"]["snpeff"] = {} 3047 3048 if "options" not in param["annotation"]["snpeff"]: 3049 param["annotation"]["snpeff"]["options"] = "" 3050 3051 # snpEff options in annotations 3052 param["annotation"]["snpeff"]["options"] = "".join( 3053 annotation_file.split(":")[1:] 3054 ) 3055 3056 # Annotation Annovar 3057 elif annotation_file.startswith("annovar"): 3058 3059 log.debug(f"Quick Annotation Annovar") 3060 3061 if "annovar" not in param["annotation"]: 3062 param["annotation"]["annovar"] = {} 3063 3064 if "annotations" not in param["annotation"]["annovar"]: 3065 param["annotation"]["annovar"]["annotations"] = {} 3066 3067 # Options 3068 annotation_file_split = annotation_file.split(":") 3069 for annotation_file_annotation in annotation_file_split[1:]: 3070 if annotation_file_annotation: 3071 param["annotation"]["annovar"]["annotations"][ 3072 annotation_file_annotation 3073 ] = annotations 3074 3075 # Annotation Exomiser 3076 elif annotation_file.startswith("exomiser"): 3077 3078 log.debug(f"Quick Annotation Exomiser") 3079 3080 param["annotation"]["exomiser"] = params_string_to_dict( 3081 annotation_file 3082 ) 3083 3084 # Annotation Splice 3085 elif annotation_file.startswith("splice"): 3086 3087 log.debug(f"Quick Annotation Splice") 3088 3089 param["annotation"]["splice"] = params_string_to_dict( 3090 annotation_file 3091 ) 3092 3093 # Annotation Parquet or BCFTOOLS 3094 else: 3095 3096 # Tools detection 3097 if annotation_file.startswith("bcftools:"): 3098 annotation_tool_initial = "bcftools" 3099 annotation_file = ":".join(annotation_file.split(":")[1:]) 3100 elif annotation_file.startswith("snpsift:"): 3101 annotation_tool_initial = 
"snpsift" 3102 annotation_file = ":".join(annotation_file.split(":")[1:]) 3103 elif annotation_file.startswith("bigwig:"): 3104 annotation_tool_initial = "bigwig" 3105 annotation_file = ":".join(annotation_file.split(":")[1:]) 3106 else: 3107 annotation_tool_initial = None 3108 3109 # list of files 3110 annotation_file_list = annotation_file.replace("+", ":").split( 3111 ":" 3112 ) 3113 3114 for annotation_file in annotation_file_list: 3115 3116 if annotation_file: 3117 3118 # Annotation tool initial 3119 annotation_tool = annotation_tool_initial 3120 3121 # Find file 3122 annotation_file_found = None 3123 3124 if os.path.exists(annotation_file): 3125 annotation_file_found = annotation_file 3126 elif os.path.exists(full_path(annotation_file)): 3127 annotation_file_found = full_path(annotation_file) 3128 else: 3129 # Find within assembly folders 3130 for annotations_database in annotations_databases: 3131 found_files = find_all( 3132 annotation_file, 3133 os.path.join( 3134 annotations_database, assembly 3135 ), 3136 ) 3137 if len(found_files) > 0: 3138 annotation_file_found = found_files[0] 3139 break 3140 if not annotation_file_found and not assembly: 3141 # Find within folders 3142 for ( 3143 annotations_database 3144 ) in annotations_databases: 3145 found_files = find_all( 3146 annotation_file, annotations_database 3147 ) 3148 if len(found_files) > 0: 3149 annotation_file_found = found_files[0] 3150 break 3151 log.debug( 3152 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3153 ) 3154 3155 # Full path 3156 annotation_file_found = full_path(annotation_file_found) 3157 3158 if annotation_file_found: 3159 3160 database = Database(database=annotation_file_found) 3161 quick_annotation_format = database.get_format() 3162 quick_annotation_is_compressed = ( 3163 database.is_compressed() 3164 ) 3165 quick_annotation_is_indexed = os.path.exists( 3166 f"{annotation_file_found}.tbi" 3167 ) 3168 bcftools_preference = False 3169 3170 # Check Annotation 
Tool 3171 if not annotation_tool: 3172 if ( 3173 bcftools_preference 3174 and quick_annotation_format 3175 in ["vcf", "bed"] 3176 and quick_annotation_is_compressed 3177 and quick_annotation_is_indexed 3178 ): 3179 annotation_tool = "bcftools" 3180 elif quick_annotation_format in [ 3181 "vcf", 3182 "bed", 3183 "tsv", 3184 "tsv", 3185 "csv", 3186 "json", 3187 "tbl", 3188 "parquet", 3189 "duckdb", 3190 ]: 3191 annotation_tool = "parquet" 3192 elif quick_annotation_format in ["bw"]: 3193 annotation_tool = "bigwig" 3194 else: 3195 log.error( 3196 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3197 ) 3198 raise ValueError( 3199 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3200 ) 3201 3202 log.debug( 3203 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3204 ) 3205 3206 # Annotation Tool dispatch 3207 if annotation_tool: 3208 if annotation_tool not in param["annotation"]: 3209 param["annotation"][annotation_tool] = {} 3210 if ( 3211 "annotations" 3212 not in param["annotation"][annotation_tool] 3213 ): 3214 param["annotation"][annotation_tool][ 3215 "annotations" 3216 ] = {} 3217 param["annotation"][annotation_tool][ 3218 "annotations" 3219 ][annotation_file_found] = annotations 3220 3221 else: 3222 log.warning( 3223 f"Quick Annotation File {annotation_file} does NOT exist" 3224 ) 3225 3226 self.set_param(param) 3227 3228 if param.get("annotation", None): 3229 log.info("Annotations") 3230 if param.get("annotation", {}).get("parquet", None): 3231 log.info("Annotations 'parquet'...") 3232 self.annotation_parquet() 3233 if param.get("annotation", {}).get("bcftools", None): 3234 log.info("Annotations 'bcftools'...") 3235 self.annotation_bcftools() 3236 if param.get("annotation", {}).get("snpsift", None): 3237 log.info("Annotations 'snpsift'...") 3238 self.annotation_snpsift() 3239 if param.get("annotation", {}).get("bigwig", None): 
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def annotation_bigwig(self, threads: int = None) -> None:
        """
        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.

        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
        number of threads to be used for parallel processing during the annotation process. If the
        `threads` parameter is not provided, the method will attempt to determine the optimal number of
        threads to use based on the system configuration
        :type threads: int
        :return: True

        .. note:: NOTE(review): `threads` is currently unused (the lookup loop is
           single-threaded, see the commented-out block below), and the method
           returns ``True`` on success despite the ``-> None`` annotation
           (``None`` when the variants table is empty).
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - bigwig databases folders ("annotations" folders plus "bigwig" folders)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: bigwig annotation databases, as {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # annotation_bigwig_config: one entry per valid bigwig database
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    # No fields requested means "all INFO fields"
                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL
                        db_file_is_http = True

                        # DB file keep as URL
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name from the file name
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file (remote bigwig has no local header)
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False

                    # Check index - try to create if not exists
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        # NOTE(review): annotation_fields_full is set but never
                        # read in this method (unlike annotation_snpsift).
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                # -3 skips the #CHROM/START/END columns so the
                                # index matches positions in bw_db.values() output
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
for variant in input_vcf: 3574 3575 for annotation_bigwig_config in annotation_bigwig_config_list: 3576 3577 # DB and indexes 3578 bw_db = annotation_bigwig_config.get("bw_db", None) 3579 cyvcf2_header_indexes = annotation_bigwig_config.get( 3580 "cyvcf2_header_indexes", None 3581 ) 3582 3583 # Retrieve value from chrom pos 3584 res = bw_db.values( 3585 variant.CHROM, variant.POS - 1, variant.POS 3586 ) 3587 3588 # For each annotation fields (and indexes) 3589 for cyvcf2_header_index in cyvcf2_header_indexes: 3590 3591 # If value is NOT nNone 3592 if not np.isnan( 3593 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3594 ): 3595 variant.INFO[cyvcf2_header_index] = res[ 3596 cyvcf2_header_indexes[cyvcf2_header_index] 3597 ] 3598 3599 # Add record in output file 3600 output_vcf.write_record(variant) 3601 3602 # Log 3603 log.debug(f"Annotation done.") 3604 3605 # Close and write file 3606 log.info(f"Annotations 'bigwig' write...") 3607 output_vcf.close() 3608 log.debug(f"Write done.") 3609 3610 # Update variants 3611 log.info(f"Annotations 'bigwig' update...") 3612 self.update_from_vcf(output_vcf_file) 3613 log.debug(f"Update done.") 3614 3615 return True 3616 3617 def annotation_snpsift(self, threads: int = None) -> None: 3618 """ 3619 This function annotate with bcftools 3620 3621 :param threads: Number of threads to use 3622 :return: the value of the variable "return_value". 
3623 """ 3624 3625 # DEBUG 3626 log.debug("Start annotation with bcftools databases") 3627 3628 # Threads 3629 if not threads: 3630 threads = self.get_threads() 3631 log.debug("Threads: " + str(threads)) 3632 3633 # Config 3634 config = self.get_config() 3635 log.debug("Config: " + str(config)) 3636 3637 # Config - snpSift 3638 snpsift_bin_command = get_bin_command( 3639 bin="SnpSift.jar", 3640 tool="snpsift", 3641 bin_type="jar", 3642 config=config, 3643 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3644 ) 3645 if not snpsift_bin_command: 3646 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3647 log.error(msg_err) 3648 raise ValueError(msg_err) 3649 3650 # Config - bcftools 3651 bcftools_bin_command = get_bin_command( 3652 bin="bcftools", 3653 tool="bcftools", 3654 bin_type="bin", 3655 config=config, 3656 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3657 ) 3658 if not bcftools_bin_command: 3659 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3660 log.error(msg_err) 3661 raise ValueError(msg_err) 3662 3663 # Config - BCFTools databases folders 3664 databases_folders = set( 3665 self.get_config() 3666 .get("folders", {}) 3667 .get("databases", {}) 3668 .get("annotations", ["."]) 3669 + self.get_config() 3670 .get("folders", {}) 3671 .get("databases", {}) 3672 .get("bcftools", ["."]) 3673 ) 3674 log.debug("Databases annotations: " + str(databases_folders)) 3675 3676 # Param 3677 annotations = ( 3678 self.get_param() 3679 .get("annotation", {}) 3680 .get("snpsift", {}) 3681 .get("annotations", None) 3682 ) 3683 log.debug("Annotations: " + str(annotations)) 3684 3685 # Assembly 3686 assembly = self.get_param().get( 3687 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3688 ) 3689 3690 # Data 3691 table_variants = self.get_table_variants() 3692 3693 # Check if not empty 3694 log.debug("Check if not empty") 3695 sql_query_chromosomes = ( 3696 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3697 ) 3698 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3699 if not sql_query_chromosomes_df["count"][0]: 3700 log.info(f"VCF empty") 3701 return 3702 3703 # VCF header 3704 vcf_reader = self.get_header() 3705 log.debug("Initial header: " + str(vcf_reader.infos)) 3706 3707 # Existing annotations 3708 for vcf_annotation in self.get_header().infos: 3709 3710 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3711 log.debug( 3712 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3713 ) 3714 3715 if annotations: 3716 3717 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3718 3719 # Export VCF file 3720 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3721 3722 # Init 3723 commands = {} 3724 3725 for annotation in annotations: 3726 annotation_fields = annotations[annotation] 3727 3728 # Annotation Name 3729 annotation_name = os.path.basename(annotation) 3730 3731 if not annotation_fields: 3732 annotation_fields = {"INFO": None} 3733 3734 log.debug(f"Annotation '{annotation_name}'") 3735 log.debug( 3736 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3737 ) 3738 3739 # Create Database 3740 database = Database( 3741 database=annotation, 3742 databases_folders=databases_folders, 3743 assembly=assembly, 3744 ) 3745 3746 # Find files 3747 db_file = database.get_database() 3748 db_file = full_path(db_file) 3749 db_hdr_file = database.get_header_file() 3750 db_hdr_file = full_path(db_hdr_file) 3751 db_file_type = database.get_format() 3752 db_tbi_file = f"{db_file}.tbi" 3753 db_file_compressed = database.is_compressed() 3754 3755 # Check if compressed 3756 if not db_file_compressed: 3757 log.error( 3758 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3759 ) 3760 raise ValueError( 3761 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3762 ) 3763 3764 # Check if indexed 3765 if not os.path.exists(db_tbi_file): 3766 log.error( 3767 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3768 ) 3769 raise ValueError( 3770 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3771 ) 3772 3773 # Check index - try to create if not exists 3774 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3775 log.error("Annotation failed: database not valid") 3776 log.error(f"Annotation annotation file: {db_file}") 3777 log.error(f"Annotation annotation header: {db_hdr_file}") 3778 log.error(f"Annotation annotation index: {db_tbi_file}") 3779 raise ValueError( 3780 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3781 ) 3782 else: 3783 3784 log.debug( 3785 f"Annotation '{annotation}' - file: " 3786 + str(db_file) 3787 + " and " 3788 + str(db_hdr_file) 3789 ) 3790 3791 # Load header as VCF object 3792 db_hdr_vcf = Variants(input=db_hdr_file) 3793 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3794 log.debug( 3795 "Annotation database header: " 3796 + str(db_hdr_vcf_header_infos) 3797 ) 3798 3799 # For all fields in database 3800 annotation_fields_full = False 3801 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3802 annotation_fields = { 3803 key: key for key in db_hdr_vcf_header_infos 3804 } 3805 log.debug( 3806 "Annotation database header - All annotations added: " 3807 + str(annotation_fields) 3808 ) 3809 annotation_fields_full = True 3810 3811 # # Create file for field rename 3812 # log.debug("Create file for field rename") 3813 # tmp_rename = NamedTemporaryFile( 3814 # prefix=self.get_prefix(), 3815 # dir=self.get_tmp_dir(), 3816 # suffix=".rename", 3817 # delete=False, 3818 # ) 3819 # tmp_rename_name = tmp_rename.name 3820 # tmp_files.append(tmp_rename_name) 3821 3822 # Number of fields 3823 nb_annotation_field = 0 3824 annotation_list = [] 3825 annotation_infos_rename_list = [] 3826 3827 for annotation_field in 
annotation_fields: 3828 3829 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3830 annotation_fields_new_name = annotation_fields.get( 3831 annotation_field, annotation_field 3832 ) 3833 if not annotation_fields_new_name: 3834 annotation_fields_new_name = annotation_field 3835 3836 # Check if field is in DB and if field is not elready in input data 3837 if ( 3838 annotation_field in db_hdr_vcf.get_header().infos 3839 and annotation_fields_new_name 3840 not in self.get_header().infos 3841 ): 3842 3843 log.info( 3844 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3845 ) 3846 3847 # BCFTools annotate param to rename fields 3848 if annotation_field != annotation_fields_new_name: 3849 annotation_infos_rename_list.append( 3850 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3851 ) 3852 3853 # Add INFO field to header 3854 db_hdr_vcf_header_infos_number = ( 3855 db_hdr_vcf_header_infos[annotation_field].num or "." 3856 ) 3857 db_hdr_vcf_header_infos_type = ( 3858 db_hdr_vcf_header_infos[annotation_field].type 3859 or "String" 3860 ) 3861 db_hdr_vcf_header_infos_description = ( 3862 db_hdr_vcf_header_infos[annotation_field].desc 3863 or f"{annotation_field} description" 3864 ) 3865 db_hdr_vcf_header_infos_source = ( 3866 db_hdr_vcf_header_infos[annotation_field].source 3867 or "unknown" 3868 ) 3869 db_hdr_vcf_header_infos_version = ( 3870 db_hdr_vcf_header_infos[annotation_field].version 3871 or "unknown" 3872 ) 3873 3874 vcf_reader.infos[annotation_fields_new_name] = ( 3875 vcf.parser._Info( 3876 annotation_fields_new_name, 3877 db_hdr_vcf_header_infos_number, 3878 db_hdr_vcf_header_infos_type, 3879 db_hdr_vcf_header_infos_description, 3880 db_hdr_vcf_header_infos_source, 3881 db_hdr_vcf_header_infos_version, 3882 self.code_type_map[ 3883 db_hdr_vcf_header_infos_type 3884 ], 3885 ) 3886 ) 3887 3888 annotation_list.append(annotation_field) 3889 3890 nb_annotation_field += 1 3891 3892 else: 3893 
3894 if ( 3895 annotation_field 3896 not in db_hdr_vcf.get_header().infos 3897 ): 3898 log.warning( 3899 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3900 ) 3901 if ( 3902 annotation_fields_new_name 3903 in self.get_header().infos 3904 ): 3905 log.warning( 3906 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3907 ) 3908 3909 log.info( 3910 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3911 ) 3912 3913 annotation_infos = ",".join(annotation_list) 3914 3915 if annotation_infos != "": 3916 3917 # Annotated VCF (and error file) 3918 tmp_annotation_vcf_name = os.path.join( 3919 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3920 ) 3921 tmp_annotation_vcf_name_err = ( 3922 tmp_annotation_vcf_name + ".err" 3923 ) 3924 3925 # Add fields to annotate 3926 if not annotation_fields_full: 3927 annotation_infos_option = f"-info {annotation_infos}" 3928 else: 3929 annotation_infos_option = "" 3930 3931 # Info fields rename 3932 if annotation_infos_rename_list: 3933 annotation_infos_rename = " -c " + ",".join( 3934 annotation_infos_rename_list 3935 ) 3936 else: 3937 annotation_infos_rename = "" 3938 3939 # Annotate command 3940 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3941 3942 # Add command 3943 commands[command_annotate] = tmp_annotation_vcf_name 3944 3945 if commands: 3946 3947 # Export VCF file 3948 self.export_variant_vcf( 3949 vcf_file=tmp_vcf_name, 3950 remove_info=True, 3951 add_samples=False, 3952 index=True, 3953 ) 3954 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3955 3956 # Num command 3957 nb_command = 0 3958 3959 # Annotate 3960 for command_annotate in commands: 3961 nb_command += 1 3962 log.info( 3963 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..."
                )
                log.debug(f"command_annotate={command_annotate}")
                run_parallel_commands([command_annotate], threads)

                # NOTE(review): unconditional debug copy to /tmp — consider removing or
                # gating on debug verbosity
                shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                # Fold the annotated VCF back into the variants table
                log.info(
                    f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                )
                self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with bcftools-compatible databases (VCF/BED)
        declared in param["annotation"]["bcftools"]["annotations"].

        For each database: header/compression/index files are validated, the
        requested INFO fields (or all fields for "ALL"/"INFO") that are present
        in the database header and absent from the current VCF header are
        registered in the header, then one `bcftools annotate` command is built
        per chromosome, restricted via a BED file of merged ~1Mb windows around
        the variants. All commands are run in parallel, the per-chromosome
        outputs are merged back with `bcftools merge`, stderr files are scanned
        for warnings/errors, and the variants table is updated from the merged
        VCF.

        :param threads: Number of threads to use; defaults to self.get_threads()
        :raises ValueError: if the bcftools binary is missing, a database is not
            compressed/indexed or its files are invalid, or any annotation
            command wrote "[E::" errors to its stderr file
        :return: None (returns early if the variants table is empty)
        """

        log.debug("Start annotation with bcftools databases")

        # Threads (fall back to instance-level setting)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files around in debug verbosity to ease troubleshooting.
        # NOTE(review): delete_tmp is only logged in this method — actual cleanup
        # happens via the `rm -f` appended to the merge command; TODO confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - bcftools bin command (fail fast if the binary is unavailable)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - database search folders: union of generic "annotations" and
        # bcftools-specific folders (set() removes duplicates)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database path -> requested fields (or None for all)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, which falls back to the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Variants table name
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Temporary VCF that will hold the exported variants to annotate
        # (delete=False: the file name is handed to external shell commands)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # Current VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log annotations already present in the input VCF
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs, for the merge
            commands = []  # bcftools annotate shell commands
            tmp_files = []  # temp files removed after the merge
            err_files = []  # stderr capture files, scanned for [W::/[E:: lines

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation name (database basename, used in log messages)
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "take everything" (handled below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Resolve the database files for this assembly
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Database file, header file, format and tabix index
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # bcftools annotate requires a bgzip-compressed database
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # ... and a tabix index next to it
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Both the database and its header file must exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header as a Variants object to read its INFO fields
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" expands to every field in the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Count of fields actually selected, and the -c column list
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Optional new name for the field (renaming); falls back
                        # to the original field name when unset
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Select the field only if it exists in the database
                        # header and is not already present in the input VCF
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Register the INFO field in the output header,
                            # substituting defaults for missing metadata
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c syntax: "NEW:=INFO/OLD" renames a
                            # field, a bare name copies it unchanged
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Skipped: either missing from the database header
                            # or already present in the input VCF
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Extract only "##" header lines for bcftools -h
                        # (drops the "#CHROM" line and any variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # zcat for gzipped headers, plain cat otherwise
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases have no INFO column: prepend the
                        # positional CHROM/POS/POS columns for bcftools -c
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools annotate command per chromosome
                        for chrom in chomosomes_list:

                            # BED restricting annotation to regions around the variants
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # 1Mb window around each variant (clamped at 0);
                            # overlapping windows are merged below to keep the
                            # region count small
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Header row, then one tab-delimited region per line
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Output VCF (and stderr file) for this chromosome
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # --pair-logic exact: annotate only records whose
                            # REF/ALT match exactly; -Oz1: compressed VCF output
                            # at compression level 1 (speed over size); output
                            # is tabix-indexed for the later merge
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            commands.append(command_annotate)

            # Run the collected annotation commands, if any
            if commands:

                # Export the variants to the temporary input VCF (INFO stripped,
                # no samples, indexed) that every command annotates
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Split the available threads across the parallel bcftools
                # commands (at least 1 thread each)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Inject --threads into each bcftools annotate invocation
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Merged output file (delete=True: the Python object keeps
                    # it alive until the method returns)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Cleanup of intermediate files is chained after the merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # --force-samples: tolerate duplicate sample names across
                    # the merged files
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Collect warnings ([W::) and errors ([E::) from all
                    # captured stderr files
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # Errors and warnings at info level (deduplicated)
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f" {message}")
                    # Everything at debug level (deduplicated)
                    for message in list(set(error_message_command_all)):
                        log.debug(f" {message}")
                    # Any [E:: line means the annotation failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Fold the merged annotated VCF back into the variants table
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None

        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"

        - "phenopacket" (dict/file):
            Samples and phenotipic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None

        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None

        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None

        - "phenotypicFeatures" (dict)
4495 Example: 4496 "phenotypicFeatures": 4497 [ 4498 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4499 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4500 ] 4501 - "hpo" (list) 4502 List of HPO ids as phenotypic features. 4503 Example: 4504 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4505 Default: [] 4506 - "outputOptions" (dict): 4507 Output options (see Exomiser docs). 4508 Default: 4509 "output_options" = 4510 { 4511 "outputContributingVariantsOnly": False, 4512 "numGenes": 0, 4513 "outputFormats": ["TSV_VARIANT", "VCF"] 4514 } 4515 - "transcript_source" (string): 4516 Transcript source (either "refseq", "ucsc", "ensembl") 4517 Default: "refseq" 4518 - "exomiser_to_info" (boolean): 4519 Add exomiser TSV file columns as INFO fields in VCF. 4520 Default: False 4521 - "release" (string): 4522 Exomise database release. 4523 If not exists, database release will be downloaded (take a while). 4524 Default: None (provided by application.properties configuration file) 4525 - "exomiser_application_properties" (file): 4526 Exomiser configuration file (see Exomiser docs). 4527 Useful to automatically download databases (especially for specific genome databases). 4528 4529 Notes: 4530 - If no sample in parameters, first sample in VCF will be chosen 4531 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4532 4533 :param threads: The number of threads to use 4534 :return: None. 
4535 """ 4536 4537 # DEBUG 4538 log.debug("Start annotation with Exomiser databases") 4539 4540 # Threads 4541 if not threads: 4542 threads = self.get_threads() 4543 log.debug("Threads: " + str(threads)) 4544 4545 # Config 4546 config = self.get_config() 4547 log.debug("Config: " + str(config)) 4548 4549 # Config - Folders - Databases 4550 databases_folders = ( 4551 config.get("folders", {}) 4552 .get("databases", {}) 4553 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4554 ) 4555 databases_folders = full_path(databases_folders) 4556 if not os.path.exists(databases_folders): 4557 log.error(f"Databases annotations: {databases_folders} NOT found") 4558 log.debug("Databases annotations: " + str(databases_folders)) 4559 4560 # Config - Exomiser 4561 exomiser_bin_command = get_bin_command( 4562 bin="exomiser-cli*.jar", 4563 tool="exomiser", 4564 bin_type="jar", 4565 config=config, 4566 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4567 ) 4568 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4569 if not exomiser_bin_command: 4570 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4571 log.error(msg_err) 4572 raise ValueError(msg_err) 4573 4574 # Param 4575 param = self.get_param() 4576 log.debug("Param: " + str(param)) 4577 4578 # Param - Exomiser 4579 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4580 log.debug(f"Param Exomiser: {param_exomiser}") 4581 4582 # Param - Assembly 4583 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4584 log.debug("Assembly: " + str(assembly)) 4585 4586 # Data 4587 table_variants = self.get_table_variants() 4588 4589 # Check if not empty 4590 log.debug("Check if not empty") 4591 sql_query_chromosomes = ( 4592 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4593 ) 4594 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4595 log.info(f"VCF empty") 4596 return False 4597 4598 # VCF header 4599 
vcf_reader = self.get_header() 4600 log.debug("Initial header: " + str(vcf_reader.infos)) 4601 4602 # Samples 4603 samples = self.get_header_sample_list() 4604 if not samples: 4605 log.error("No Samples in VCF") 4606 return False 4607 log.debug(f"Samples: {samples}") 4608 4609 # Memory limit 4610 memory_limit = self.get_memory("8G") 4611 log.debug(f"memory_limit: {memory_limit}") 4612 4613 # Exomiser java options 4614 exomiser_java_options = ( 4615 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4616 ) 4617 log.debug(f"Exomiser java options: {exomiser_java_options}") 4618 4619 # Download Exomiser (if not exists) 4620 exomiser_release = param_exomiser.get("release", None) 4621 exomiser_application_properties = param_exomiser.get( 4622 "exomiser_application_properties", None 4623 ) 4624 databases_download_exomiser( 4625 assemblies=[assembly], 4626 exomiser_folder=databases_folders, 4627 exomiser_release=exomiser_release, 4628 exomiser_phenotype_release=exomiser_release, 4629 exomiser_application_properties=exomiser_application_properties, 4630 ) 4631 4632 # Force annotation 4633 force_update_annotation = True 4634 4635 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4636 log.debug("Start annotation Exomiser") 4637 4638 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4639 4640 # tmp_dir = "/tmp/exomiser" 4641 4642 ### ANALYSIS ### 4643 ################ 4644 4645 # Create analysis.json through analysis dict 4646 # either analysis in param or by default 4647 # depending on preset exome/genome) 4648 4649 # Init analysis dict 4650 param_exomiser_analysis_dict = {} 4651 4652 # analysis from param 4653 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4654 param_exomiser_analysis = full_path(param_exomiser_analysis) 4655 4656 # If analysis in param -> load anlaysis json 4657 if param_exomiser_analysis: 4658 4659 # If param analysis is a file and exists 4660 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4661 param_exomiser_analysis 4662 ): 4663 # Load analysis file into analysis dict (either yaml or json) 4664 with open(param_exomiser_analysis) as json_file: 4665 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4666 4667 # If param analysis is a dict 4668 elif isinstance(param_exomiser_analysis, dict): 4669 # Load analysis dict into analysis dict (either yaml or json) 4670 param_exomiser_analysis_dict = param_exomiser_analysis 4671 4672 # Error analysis type 4673 else: 4674 log.error(f"Analysis type unknown. Check param file.") 4675 raise ValueError(f"Analysis type unknown. Check param file.") 4676 4677 # Case no input analysis config file/dict 4678 # Use preset (exome/genome) to open default config file 4679 if not param_exomiser_analysis_dict: 4680 4681 # default preset 4682 default_preset = "exome" 4683 4684 # Get param preset or default preset 4685 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4686 4687 # Try to find if preset is a file 4688 if os.path.exists(param_exomiser_preset): 4689 # Preset file is provided in full path 4690 param_exomiser_analysis_default_config_file = ( 4691 param_exomiser_preset 4692 ) 4693 # elif os.path.exists(full_path(param_exomiser_preset)): 4694 # # Preset file is provided in full path 4695 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4696 elif os.path.exists( 4697 os.path.join(folder_config, param_exomiser_preset) 4698 ): 4699 # Preset file is provided a basename in config folder (can be a path with subfolders) 4700 param_exomiser_analysis_default_config_file = os.path.join( 4701 folder_config, param_exomiser_preset 4702 ) 4703 else: 4704 # Construct preset file 4705 param_exomiser_analysis_default_config_file = os.path.join( 4706 folder_config, 4707 f"preset-{param_exomiser_preset}-analysis.json", 4708 ) 4709 4710 # If preset file exists 4711 param_exomiser_analysis_default_config_file = full_path( 4712 
param_exomiser_analysis_default_config_file 4713 ) 4714 if os.path.exists(param_exomiser_analysis_default_config_file): 4715 # Load prest file into analysis dict (either yaml or json) 4716 with open( 4717 param_exomiser_analysis_default_config_file 4718 ) as json_file: 4719 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4720 json_file 4721 ) 4722 4723 # Error preset file 4724 else: 4725 log.error( 4726 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4727 ) 4728 raise ValueError( 4729 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4730 ) 4731 4732 # If no analysis dict created 4733 if not param_exomiser_analysis_dict: 4734 log.error(f"No analysis config") 4735 raise ValueError(f"No analysis config") 4736 4737 # Log 4738 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4739 4740 ### PHENOPACKET ### 4741 ################### 4742 4743 # If no PhenoPacket in analysis dict -> check in param 4744 if "phenopacket" not in param_exomiser_analysis_dict: 4745 4746 # If PhenoPacket in param -> load anlaysis json 4747 if param_exomiser.get("phenopacket", None): 4748 4749 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4750 param_exomiser_phenopacket = full_path( 4751 param_exomiser_phenopacket 4752 ) 4753 4754 # If param phenopacket is a file and exists 4755 if isinstance( 4756 param_exomiser_phenopacket, str 4757 ) and os.path.exists(param_exomiser_phenopacket): 4758 # Load phenopacket file into analysis dict (either yaml or json) 4759 with open(param_exomiser_phenopacket) as json_file: 4760 param_exomiser_analysis_dict["phenopacket"] = ( 4761 yaml.safe_load(json_file) 4762 ) 4763 4764 # If param phenopacket is a dict 4765 elif isinstance(param_exomiser_phenopacket, dict): 4766 # Load phenopacket dict into analysis dict (either yaml or json) 4767 param_exomiser_analysis_dict["phenopacket"] = ( 4768 param_exomiser_phenopacket 4769 ) 4770 4771 # Error phenopacket type 
4772 else: 4773 log.error(f"Phenopacket type unknown. Check param file.") 4774 raise ValueError( 4775 f"Phenopacket type unknown. Check param file." 4776 ) 4777 4778 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4779 if "phenopacket" not in param_exomiser_analysis_dict: 4780 4781 # Init PhenoPacket 4782 param_exomiser_analysis_dict["phenopacket"] = { 4783 "id": "analysis", 4784 "proband": {}, 4785 } 4786 4787 ### Add subject ### 4788 4789 # If subject exists 4790 param_exomiser_subject = param_exomiser.get("subject", {}) 4791 4792 # If subject not exists -> found sample ID 4793 if not param_exomiser_subject: 4794 4795 # Found sample ID in param 4796 sample = param_exomiser.get("sample", None) 4797 4798 # Find sample ID (first sample) 4799 if not sample: 4800 sample_list = self.get_header_sample_list() 4801 if len(sample_list) > 0: 4802 sample = sample_list[0] 4803 else: 4804 log.error(f"No sample found") 4805 raise ValueError(f"No sample found") 4806 4807 # Create subject 4808 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4809 4810 # Add to dict 4811 param_exomiser_analysis_dict["phenopacket"][ 4812 "subject" 4813 ] = param_exomiser_subject 4814 4815 ### Add "phenotypicFeatures" ### 4816 4817 # If phenotypicFeatures exists 4818 param_exomiser_phenotypicfeatures = param_exomiser.get( 4819 "phenotypicFeatures", [] 4820 ) 4821 4822 # If phenotypicFeatures not exists -> Try to infer from hpo list 4823 if not param_exomiser_phenotypicfeatures: 4824 4825 # Found HPO in param 4826 param_exomiser_hpo = param_exomiser.get("hpo", []) 4827 4828 # Split HPO if list in string format separated by comma 4829 if isinstance(param_exomiser_hpo, str): 4830 param_exomiser_hpo = param_exomiser_hpo.split(",") 4831 4832 # Create HPO list 4833 for hpo in param_exomiser_hpo: 4834 hpo_clean = re.sub("[^0-9]", "", hpo) 4835 param_exomiser_phenotypicfeatures.append( 4836 { 4837 "type": { 4838 "id": f"HP:{hpo_clean}", 4839 "label": 
f"HP:{hpo_clean}", 4840 } 4841 } 4842 ) 4843 4844 # Add to dict 4845 param_exomiser_analysis_dict["phenopacket"][ 4846 "phenotypicFeatures" 4847 ] = param_exomiser_phenotypicfeatures 4848 4849 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4850 if not param_exomiser_phenotypicfeatures: 4851 for step in param_exomiser_analysis_dict.get( 4852 "analysis", {} 4853 ).get("steps", []): 4854 if "hiPhivePrioritiser" in step: 4855 param_exomiser_analysis_dict.get("analysis", {}).get( 4856 "steps", [] 4857 ).remove(step) 4858 4859 ### Add Input File ### 4860 4861 # Initial file name and htsFiles 4862 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4863 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4864 { 4865 "uri": tmp_vcf_name, 4866 "htsFormat": "VCF", 4867 "genomeAssembly": assembly, 4868 } 4869 ] 4870 4871 ### Add metaData ### 4872 4873 # If metaData not in analysis dict 4874 if "metaData" not in param_exomiser_analysis_dict: 4875 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4876 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4877 "createdBy": "howard", 4878 "phenopacketSchemaVersion": 1, 4879 } 4880 4881 ### OutputOptions ### 4882 4883 # Init output result folder 4884 output_results = os.path.join(tmp_dir, "results") 4885 4886 # If no outputOptions in analysis dict 4887 if "outputOptions" not in param_exomiser_analysis_dict: 4888 4889 # default output formats 4890 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4891 4892 # Get outputOptions in param 4893 output_options = param_exomiser.get("outputOptions", None) 4894 4895 # If no output_options in param -> check 4896 if not output_options: 4897 output_options = { 4898 "outputContributingVariantsOnly": False, 4899 "numGenes": 0, 4900 "outputFormats": defaut_output_formats, 4901 } 4902 4903 # Replace outputDirectory in output options 4904 output_options["outputDirectory"] = output_results 4905 output_options["outputFileName"] = "howard" 4906 4907 # 
Add outputOptions in analysis dict 4908 param_exomiser_analysis_dict["outputOptions"] = output_options 4909 4910 else: 4911 4912 # Replace output_results and output format (if exists in param) 4913 param_exomiser_analysis_dict["outputOptions"][ 4914 "outputDirectory" 4915 ] = output_results 4916 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4917 list( 4918 set( 4919 param_exomiser_analysis_dict.get( 4920 "outputOptions", {} 4921 ).get("outputFormats", []) 4922 + ["TSV_VARIANT", "VCF"] 4923 ) 4924 ) 4925 ) 4926 4927 # log 4928 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4929 4930 ### ANALYSIS FILE ### 4931 ##################### 4932 4933 ### Full JSON analysis config file ### 4934 4935 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4936 with open(exomiser_analysis, "w") as fp: 4937 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4938 4939 ### SPLIT analysis and sample config files 4940 4941 # Splitted analysis dict 4942 param_exomiser_analysis_dict_for_split = ( 4943 param_exomiser_analysis_dict.copy() 4944 ) 4945 4946 # Phenopacket JSON file 4947 exomiser_analysis_phenopacket = os.path.join( 4948 tmp_dir, "analysis_phenopacket.json" 4949 ) 4950 with open(exomiser_analysis_phenopacket, "w") as fp: 4951 json.dump( 4952 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4953 fp, 4954 indent=4, 4955 ) 4956 4957 # Analysis JSON file without Phenopacket parameters 4958 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4959 exomiser_analysis_analysis = os.path.join( 4960 tmp_dir, "analysis_analysis.json" 4961 ) 4962 with open(exomiser_analysis_analysis, "w") as fp: 4963 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4964 4965 ### INITAL VCF file ### 4966 ####################### 4967 4968 ### Create list of samples to use and include inti initial VCF file #### 4969 4970 # Subject (main sample) 4971 # Get sample ID in analysis dict 4972 sample_subject = ( 4973 
param_exomiser_analysis_dict.get("phenopacket", {}) 4974 .get("subject", {}) 4975 .get("id", None) 4976 ) 4977 sample_proband = ( 4978 param_exomiser_analysis_dict.get("phenopacket", {}) 4979 .get("proband", {}) 4980 .get("subject", {}) 4981 .get("id", None) 4982 ) 4983 sample = [] 4984 if sample_subject: 4985 sample.append(sample_subject) 4986 if sample_proband: 4987 sample.append(sample_proband) 4988 4989 # Get sample ID within Pedigree 4990 pedigree_persons_list = ( 4991 param_exomiser_analysis_dict.get("phenopacket", {}) 4992 .get("pedigree", {}) 4993 .get("persons", {}) 4994 ) 4995 4996 # Create list with all sample ID in pedigree (if exists) 4997 pedigree_persons = [] 4998 for person in pedigree_persons_list: 4999 pedigree_persons.append(person.get("individualId")) 5000 5001 # Concat subject sample ID and samples ID in pedigreesamples 5002 samples = list(set(sample + pedigree_persons)) 5003 5004 # Check if sample list is not empty 5005 if not samples: 5006 log.error(f"No samples found") 5007 raise ValueError(f"No samples found") 5008 5009 # Create VCF with sample (either sample in param or first one by default) 5010 # Export VCF file 5011 self.export_variant_vcf( 5012 vcf_file=tmp_vcf_name, 5013 remove_info=True, 5014 add_samples=True, 5015 list_samples=samples, 5016 index=False, 5017 ) 5018 5019 ### Execute Exomiser ### 5020 ######################## 5021 5022 # Init command 5023 exomiser_command = "" 5024 5025 # Command exomiser options 5026 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5027 5028 # Release 5029 exomiser_release = param_exomiser.get("release", None) 5030 if exomiser_release: 5031 # phenotype data version 5032 exomiser_options += ( 5033 f" --exomiser.phenotype.data-version={exomiser_release} " 5034 ) 5035 # data version 5036 exomiser_options += ( 5037 f" --exomiser.{assembly}.data-version={exomiser_release} " 5038 ) 5039 # variant 
white list 5040 variant_white_list_file = ( 5041 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5042 ) 5043 if os.path.exists( 5044 os.path.join( 5045 databases_folders, assembly, variant_white_list_file 5046 ) 5047 ): 5048 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5049 5050 # transcript_source 5051 transcript_source = param_exomiser.get( 5052 "transcript_source", None 5053 ) # ucsc, refseq, ensembl 5054 if transcript_source: 5055 exomiser_options += ( 5056 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5057 ) 5058 5059 # If analysis contain proband param 5060 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5061 "proband", {} 5062 ): 5063 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5064 5065 # If no proband (usually uniq sample) 5066 else: 5067 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5068 5069 # Log 5070 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5071 5072 # Run command 5073 result = subprocess.call( 5074 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5075 ) 5076 if result: 5077 log.error("Exomiser command failed") 5078 raise ValueError("Exomiser command failed") 5079 5080 ### RESULTS ### 5081 ############### 5082 5083 ### Annotate with TSV fields ### 5084 5085 # Init result tsv file 5086 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5087 5088 # Init result tsv file 5089 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5090 5091 # Parse TSV file and explode columns in INFO field 5092 if exomiser_to_info and os.path.exists(output_results_tsv): 5093 5094 # Log 5095 log.debug("Exomiser columns to VCF INFO field") 5096 5097 # Retrieve columns and types 5098 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5099 output_results_tsv_df = self.get_query_to_df(query) 5100 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5101 5102 # Init concat fields for update 5103 sql_query_update_concat_fields = [] 5104 5105 # Fields to avoid 5106 fields_to_avoid = [ 5107 "CONTIG", 5108 "START", 5109 "END", 5110 "REF", 5111 "ALT", 5112 "QUAL", 5113 "FILTER", 5114 "GENOTYPE", 5115 ] 5116 5117 # List all columns to add into header 5118 for header_column in output_results_tsv_columns: 5119 5120 # If header column is enable 5121 if header_column not in fields_to_avoid: 5122 5123 # Header info type 5124 header_info_type = "String" 5125 header_column_df = output_results_tsv_df[header_column] 5126 header_column_df_dtype = header_column_df.dtype 5127 if header_column_df_dtype == object: 5128 if ( 5129 pd.to_numeric(header_column_df, errors="coerce") 5130 .notnull() 5131 .all() 5132 ): 5133 header_info_type = "Float" 5134 else: 5135 header_info_type = "Integer" 5136 5137 # Header info 5138 characters_to_validate = ["-"] 5139 pattern = "[" + "".join(characters_to_validate) + "]" 5140 header_info_name = re.sub( 5141 pattern, 5142 "_", 5143 f"Exomiser_{header_column}".replace("#", ""), 5144 ) 5145 header_info_number = "." 
                            # Free-text description for the VCF header line
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            # Register the new INFO field in the in-memory VCF header
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            # (empty or '.' values are skipped so INFO stays clean)
                            sql_query_update_concat_fields.append(
                                f"""
                            CASE
                                WHEN table_parquet."{header_column}" NOT IN ('','.')
                                THEN concat(
                                    '{header_info_name}=',
                                    table_parquet."{header_column}",
                                    ';'
                                )

                                ELSE ''
                            END
                            """
                            )

                # Update query: append the Exomiser TSV columns to the INFO field
                # of each matching variant (joined on chrom/pos/ref/alt).
                # NOTE(review): the join prepends 'chr' to CONTIG — this assumes
                # #CHROM values are 'chr'-prefixed while Exomiser CONTIG values
                # are not; confirm this holds for all assemblies.
                sql_query_update = f"""
                UPDATE {table_variants} as table_variants
                SET INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN table_variants.INFO NOT IN ('','.')
                        THEN ';'
                        ELSE ''
                    END,
                    (
                        SELECT
                            concat(
                                {",".join(sql_query_update_concat_fields)}
                            )
                        FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                        WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                        AND table_parquet.\"START\" = table_variants.\"POS\"
                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                    )
                )
                ;
                """

                # Update
                self.conn.execute(sql_query_update)

            ### Annotate with VCF INFO field ###

            # Init result VCF file
            output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

            # If VCF exists
            if os.path.exists(output_results_vcf):

                # Log
                log.debug("Exomiser result VCF update variants")

                # Find Exomiser INFO field annotation in header
                with gzip.open(output_results_vcf, "rt") as f:
                    header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                # Add annotation INFO field to header
                vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                # Update variants with VCF
                self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate loaded variants with snpEff.

        Exports the current variants table to a temporary VCF, runs the snpEff
        jar on it as an external command, adds the new INFO fields (e.g. 'ANN')
        to the in-memory VCF header, and merges the annotated VCF back into the
        variants table.

        :param threads: number of threads to use (defaults to self.get_threads())
        :raises ValueError: if the snpEff binary cannot be resolved, or if the
            external command reports errors ('[E::' lines on stderr)
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))
        # NOTE(review): delete_tmp is computed but not used below — temp file
        # lifetime is driven by NamedTemporaryFile(delete=...) instead; confirm
        # whether this flag is still needed.

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command (jar invocation resolved from config)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options
        # NOTE(review): same param key as 'options' above, fetched twice with
        # different defaults (None vs "").
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        # 'OUTPUT' placeholder in stats paths is replaced by the output file path
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built but never injected into
        # snpeff_command below — confirm whether get_bin_command already sets
        # the java options. The log message also says 'Exomiser' although this
        # is the snpEff method.
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # NOTE(review): hard-coded True makes the 'else' branch unreachable
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloads the assembly database if missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped, no samples) as snpEff input
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file for snpEff output (plain VCF) and its stderr capture
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF on stdout
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan captured stderr for warning/error lines
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any '[E::' line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header of the snpEff output, and merge the
            # new INFO definitions into the current header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        :param threads: number of threads to use
        :return: the value of the variable "return_value".
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files collected here are removed in the cleanup step below
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))
        # NOTE(review): delete_tmp is not used — the cleanup block at the end
        # runs unconditionally (guarded by 'if True:').

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl resolved from config)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for view/annotate/merge steps)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (first entry wins if a list is provided)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (dict: annovar database name -> fields to keep)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded True — existing fields are always re-annotated
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never used below
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (Annovar input)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing files for the assembly)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per database, each producing its own annotated VCF
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-created here, so only the current
                # database's stderr (plus the merge step's) is checked each pass
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one 'old new' pair per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol: annovar database name passed to --protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: 'f' (filter-based) by default; 'g' (gene-based) for
                # refGene/ensGene databases, 'r' (region-based) for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan captured stderr for warning/error lines
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: initial export + one VCF per annovar database
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and add the new
                # INFO definitions to the current header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        It takes a VCF file, and annotates it with a parquet file

        :param threads: number of threads to use for the annotation
        :return: the value of the variable "result".
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: union of 'annotations' and 'parquet' databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update",
False) 5897 ) 5898 log.debug(f"force_update_annotation={force_update_annotation}") 5899 force_append_annotation = ( 5900 self.get_param() 5901 .get("annotation", {}) 5902 .get("options", {}) 5903 .get("annotations_append", False) 5904 ) 5905 log.debug(f"force_append_annotation={force_append_annotation}") 5906 5907 # Data 5908 table_variants = self.get_table_variants() 5909 5910 # Check if not empty 5911 log.debug("Check if not empty") 5912 sql_query_chromosomes_df = self.get_query_to_df( 5913 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5914 ) 5915 if not sql_query_chromosomes_df["count"][0]: 5916 log.info(f"VCF empty") 5917 return 5918 5919 # VCF header 5920 vcf_reader = self.get_header() 5921 log.debug("Initial header: " + str(vcf_reader.infos)) 5922 5923 # Nb Variants POS 5924 log.debug("NB Variants Start") 5925 nb_variants = self.conn.execute( 5926 f"SELECT count(*) AS count FROM variants" 5927 ).fetchdf()["count"][0] 5928 log.debug("NB Variants Stop") 5929 5930 # Existing annotations 5931 for vcf_annotation in self.get_header().infos: 5932 5933 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5934 log.debug( 5935 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5936 ) 5937 5938 # Added columns 5939 added_columns = [] 5940 5941 # drop indexes 5942 log.debug(f"Drop indexes...") 5943 self.drop_indexes() 5944 5945 if annotations: 5946 5947 if "ALL" in annotations: 5948 5949 all_param = annotations.get("ALL", {}) 5950 all_param_formats = all_param.get("formats", None) 5951 all_param_releases = all_param.get("releases", None) 5952 5953 databases_infos_dict = self.scan_databases( 5954 database_formats=all_param_formats, 5955 database_releases=all_param_releases, 5956 ) 5957 for database_infos in databases_infos_dict.keys(): 5958 if database_infos not in annotations: 5959 annotations[database_infos] = {"INFO": None} 5960 5961 for annotation in annotations: 5962 5963 if annotation in ["ALL"]: 
5964 continue 5965 5966 # Annotation Name 5967 annotation_name = os.path.basename(annotation) 5968 5969 # Annotation fields 5970 annotation_fields = annotations[annotation] 5971 if not annotation_fields: 5972 annotation_fields = {"INFO": None} 5973 5974 log.debug(f"Annotation '{annotation_name}'") 5975 log.debug( 5976 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5977 ) 5978 5979 # Create Database 5980 database = Database( 5981 database=annotation, 5982 databases_folders=databases_folders, 5983 assembly=assembly, 5984 ) 5985 5986 # Find files 5987 parquet_file = database.get_database() 5988 parquet_hdr_file = database.get_header_file() 5989 parquet_type = database.get_type() 5990 5991 # Check if files exists 5992 if not parquet_file or not parquet_hdr_file: 5993 msg_err_list = [] 5994 if not parquet_file: 5995 msg_err_list.append( 5996 f"Annotation failed: Annotation file not found" 5997 ) 5998 if parquet_file and not parquet_hdr_file: 5999 msg_err_list.append( 6000 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6001 ) 6002 6003 log.error(". ".join(msg_err_list)) 6004 raise ValueError(". 
".join(msg_err_list)) 6005 else: 6006 # Get parquet connexion 6007 parquet_sql_attach = database.get_sql_database_attach( 6008 output="query" 6009 ) 6010 if parquet_sql_attach: 6011 self.conn.execute(parquet_sql_attach) 6012 parquet_file_link = database.get_sql_database_link() 6013 # Log 6014 log.debug( 6015 f"Annotation '{annotation_name}' - file: " 6016 + str(parquet_file) 6017 + " and " 6018 + str(parquet_hdr_file) 6019 ) 6020 6021 # Database full header columns 6022 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6023 parquet_hdr_file 6024 ) 6025 # Log 6026 log.debug( 6027 "Annotation database header columns : " 6028 + str(parquet_hdr_vcf_header_columns) 6029 ) 6030 6031 # Load header as VCF object 6032 parquet_hdr_vcf_header_infos = database.get_header().infos 6033 # Log 6034 log.debug( 6035 "Annotation database header: " 6036 + str(parquet_hdr_vcf_header_infos) 6037 ) 6038 6039 # Get extra infos 6040 parquet_columns = database.get_extra_columns() 6041 # Log 6042 log.debug("Annotation database Columns: " + str(parquet_columns)) 6043 6044 # Add extra columns if "ALL" in annotation_fields 6045 # if "ALL" in annotation_fields: 6046 # allow_add_extra_column = True 6047 if "ALL" in annotation_fields and database.get_extra_columns(): 6048 for extra_column in database.get_extra_columns(): 6049 if ( 6050 extra_column not in annotation_fields 6051 and extra_column.replace("INFO/", "") 6052 not in parquet_hdr_vcf_header_infos 6053 ): 6054 parquet_hdr_vcf_header_infos[extra_column] = ( 6055 vcf.parser._Info( 6056 extra_column, 6057 ".", 6058 "String", 6059 f"{extra_column} description", 6060 "unknown", 6061 "unknown", 6062 self.code_type_map["String"], 6063 ) 6064 ) 6065 6066 # For all fields in database 6067 annotation_fields_all = False 6068 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6069 annotation_fields_all = True 6070 annotation_fields = { 6071 key: key for key in parquet_hdr_vcf_header_infos 6072 } 6073 6074 log.debug( 6075 
"Annotation database header - All annotations added: " 6076 + str(annotation_fields) 6077 ) 6078 6079 # Init 6080 6081 # List of annotation fields to use 6082 sql_query_annotation_update_info_sets = [] 6083 6084 # List of annotation to agregate 6085 sql_query_annotation_to_agregate = [] 6086 6087 # Number of fields 6088 nb_annotation_field = 0 6089 6090 # Annotation fields processed 6091 annotation_fields_processed = [] 6092 6093 # Columns mapping 6094 map_columns = database.map_columns( 6095 columns=annotation_fields, prefixes=["INFO/"] 6096 ) 6097 6098 # Query dict for fields to remove (update option) 6099 query_dict_remove = {} 6100 6101 # Fetch Anotation fields 6102 for annotation_field in annotation_fields: 6103 6104 # annotation_field_column 6105 annotation_field_column = map_columns.get( 6106 annotation_field, "INFO" 6107 ) 6108 6109 # field new name, if parametered 6110 annotation_fields_new_name = annotation_fields.get( 6111 annotation_field, annotation_field 6112 ) 6113 if not annotation_fields_new_name: 6114 annotation_fields_new_name = annotation_field 6115 6116 # To annotate 6117 # force_update_annotation = True 6118 # force_append_annotation = True 6119 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6120 if annotation_field in parquet_hdr_vcf_header_infos and ( 6121 force_update_annotation 6122 or force_append_annotation 6123 or ( 6124 annotation_fields_new_name 6125 not in self.get_header().infos 6126 ) 6127 ): 6128 6129 # Add field to annotation to process list 6130 annotation_fields_processed.append( 6131 annotation_fields_new_name 6132 ) 6133 6134 # explode infos for the field 6135 annotation_fields_new_name_info_msg = "" 6136 if ( 6137 force_update_annotation 6138 and annotation_fields_new_name 6139 in self.get_header().infos 6140 ): 6141 # Remove field from INFO 6142 query = f""" 6143 UPDATE {table_variants} as table_variants 6144 SET INFO = 
REGEXP_REPLACE( 6145 concat(table_variants.INFO,''), 6146 ';*{annotation_fields_new_name}=[^;]*', 6147 '' 6148 ) 6149 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6150 """ 6151 annotation_fields_new_name_info_msg = " [update]" 6152 query_dict_remove[ 6153 f"remove 'INFO/{annotation_fields_new_name}'" 6154 ] = query 6155 6156 # Sep between fields in INFO 6157 nb_annotation_field += 1 6158 if nb_annotation_field > 1: 6159 annotation_field_sep = ";" 6160 else: 6161 annotation_field_sep = "" 6162 6163 log.info( 6164 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6165 ) 6166 6167 # Add INFO field to header 6168 parquet_hdr_vcf_header_infos_number = ( 6169 parquet_hdr_vcf_header_infos[annotation_field].num 6170 or "." 6171 ) 6172 parquet_hdr_vcf_header_infos_type = ( 6173 parquet_hdr_vcf_header_infos[annotation_field].type 6174 or "String" 6175 ) 6176 parquet_hdr_vcf_header_infos_description = ( 6177 parquet_hdr_vcf_header_infos[annotation_field].desc 6178 or f"{annotation_field} description" 6179 ) 6180 parquet_hdr_vcf_header_infos_source = ( 6181 parquet_hdr_vcf_header_infos[annotation_field].source 6182 or "unknown" 6183 ) 6184 parquet_hdr_vcf_header_infos_version = ( 6185 parquet_hdr_vcf_header_infos[annotation_field].version 6186 or "unknown" 6187 ) 6188 6189 vcf_reader.infos[annotation_fields_new_name] = ( 6190 vcf.parser._Info( 6191 annotation_fields_new_name, 6192 parquet_hdr_vcf_header_infos_number, 6193 parquet_hdr_vcf_header_infos_type, 6194 parquet_hdr_vcf_header_infos_description, 6195 parquet_hdr_vcf_header_infos_source, 6196 parquet_hdr_vcf_header_infos_version, 6197 self.code_type_map[ 6198 parquet_hdr_vcf_header_infos_type 6199 ], 6200 ) 6201 ) 6202 6203 # Append 6204 if force_append_annotation: 6205 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6206 else: 6207 query_case_when_append = "" 6208 6209 # Annotation/Update query fields 6210 # Found in INFO column 6211 if ( 6212 annotation_field_column == "INFO" 6213 and "INFO" in parquet_hdr_vcf_header_columns 6214 ): 6215 sql_query_annotation_update_info_sets.append( 6216 f""" 6217 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6218 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6219 ELSE '' 6220 END 6221 """ 6222 ) 6223 # Found in a specific column 6224 else: 6225 sql_query_annotation_update_info_sets.append( 6226 f""" 6227 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6228 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6229 ELSE '' 6230 END 6231 """ 6232 ) 6233 sql_query_annotation_to_agregate.append( 6234 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6235 ) 6236 6237 # Not to annotate 6238 else: 6239 6240 if force_update_annotation: 6241 annotation_message = "forced" 6242 else: 6243 annotation_message = "skipped" 6244 6245 if annotation_field not in parquet_hdr_vcf_header_infos: 6246 log.warning( 6247 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6248 ) 6249 if annotation_fields_new_name in self.get_header().infos: 6250 log.warning( 6251 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6252 ) 6253 6254 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6255 # allow_annotation_full_info = True 6256 allow_annotation_full_info = not force_append_annotation 6257 6258 if parquet_type in ["regions"]: 6259 allow_annotation_full_info = False 6260 6261 if ( 6262 allow_annotation_full_info 6263 and nb_annotation_field == len(annotation_fields) 6264 and annotation_fields_all 6265 and ( 6266 "INFO" in parquet_hdr_vcf_header_columns 6267 and "INFO" in database.get_extra_columns() 6268 ) 6269 ): 6270 log.debug("Column INFO annotation enabled") 6271 sql_query_annotation_update_info_sets = [] 6272 sql_query_annotation_update_info_sets.append( 6273 f" table_parquet.INFO " 6274 ) 6275 6276 if sql_query_annotation_update_info_sets: 6277 6278 # Annotate 6279 log.info(f"Annotation '{annotation_name}' - Annotation...") 6280 6281 # Join query annotation update info sets for SQL 6282 sql_query_annotation_update_info_sets_sql = ",".join( 6283 sql_query_annotation_update_info_sets 6284 ) 6285 6286 # Check chromosomes list (and variants infos) 6287 sql_query_chromosomes = f""" 6288 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6289 FROM {table_variants} as table_variants 6290 GROUP BY table_variants."#CHROM" 6291 ORDER BY table_variants."#CHROM" 6292 """ 6293 sql_query_chromosomes_df = self.conn.execute( 6294 sql_query_chromosomes 6295 ).df() 6296 sql_query_chromosomes_dict = { 6297 entry["CHROM"]: { 6298 "count": entry["count_variants"], 6299 "min": entry["min_variants"], 6300 "max": entry["max_variants"], 6301 } 6302 for index, entry in sql_query_chromosomes_df.iterrows() 6303 } 6304 6305 # Init 6306 nb_of_query = 0 6307 nb_of_variant_annotated = 0 6308 query_dict = query_dict_remove 6309 6310 # for chrom in sql_query_chromosomes_df["CHROM"]: 6311 for chrom in sql_query_chromosomes_dict: 6312 6313 # Number of variant by chromosome 6314 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6315 chrom, {} 6316 ).get("count", 0) 6317 6318 
log.debug( 6319 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6320 ) 6321 6322 # Annotation with regions database 6323 if parquet_type in ["regions"]: 6324 sql_query_annotation_from_clause = f""" 6325 FROM ( 6326 SELECT 6327 '{chrom}' AS \"#CHROM\", 6328 table_variants_from.\"POS\" AS \"POS\", 6329 {",".join(sql_query_annotation_to_agregate)} 6330 FROM {table_variants} as table_variants_from 6331 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6332 table_parquet_from."#CHROM" = '{chrom}' 6333 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6334 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6335 ) 6336 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6337 GROUP BY table_variants_from.\"POS\" 6338 ) 6339 as table_parquet 6340 """ 6341 6342 sql_query_annotation_where_clause = """ 6343 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6344 AND table_parquet.\"POS\" = table_variants.\"POS\" 6345 """ 6346 6347 # Annotation with variants database 6348 else: 6349 sql_query_annotation_from_clause = f""" 6350 FROM {parquet_file_link} as table_parquet 6351 """ 6352 sql_query_annotation_where_clause = f""" 6353 table_variants."#CHROM" = '{chrom}' 6354 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6355 AND table_parquet.\"POS\" = table_variants.\"POS\" 6356 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6357 AND table_parquet.\"REF\" = table_variants.\"REF\" 6358 """ 6359 6360 # Create update query 6361 sql_query_annotation_chrom_interval_pos = f""" 6362 UPDATE {table_variants} as table_variants 6363 SET INFO = 6364 concat( 6365 CASE WHEN table_variants.INFO NOT IN ('','.') 6366 THEN table_variants.INFO 6367 ELSE '' 6368 END 6369 , 6370 CASE WHEN table_variants.INFO NOT IN ('','.') 6371 AND ( 6372 concat({sql_query_annotation_update_info_sets_sql}) 6373 ) 6374 NOT IN ('','.') 6375 THEN ';' 6376 ELSE '' 6377 END 6378 , 6379 
{sql_query_annotation_update_info_sets_sql} 6380 ) 6381 {sql_query_annotation_from_clause} 6382 WHERE {sql_query_annotation_where_clause} 6383 ; 6384 """ 6385 6386 # Add update query to dict 6387 query_dict[ 6388 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6389 ] = sql_query_annotation_chrom_interval_pos 6390 6391 nb_of_query = len(query_dict) 6392 num_query = 0 6393 6394 # SET max_expression_depth TO x 6395 self.conn.execute("SET max_expression_depth TO 10000") 6396 6397 for query_name in query_dict: 6398 query = query_dict[query_name] 6399 num_query += 1 6400 log.info( 6401 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6402 ) 6403 result = self.conn.execute(query) 6404 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6405 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6406 log.info( 6407 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6408 ) 6409 6410 log.info( 6411 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6412 ) 6413 6414 else: 6415 6416 log.info( 6417 f"Annotation '{annotation_name}' - No Annotations available" 6418 ) 6419 6420 log.debug("Final header: " + str(vcf_reader.infos)) 6421 6422 # Remove added columns 6423 for added_column in added_columns: 6424 self.drop_column(column=added_column) 6425 6426 def annotation_splice(self, threads: int = None) -> None: 6427 """ 6428 This function annotate with snpEff 6429 6430 :param threads: The number of threads to use 6431 :return: the value of the variable "return_value". 
6432 """ 6433 6434 # DEBUG 6435 log.debug("Start annotation with splice tools") 6436 6437 # Threads 6438 if not threads: 6439 threads = self.get_threads() 6440 log.debug("Threads: " + str(threads)) 6441 6442 # DEBUG 6443 delete_tmp = True 6444 if self.get_config().get("verbosity", "warning") in ["debug"]: 6445 delete_tmp = False 6446 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6447 6448 # Config 6449 config = self.get_config() 6450 log.debug("Config: " + str(config)) 6451 splice_config = config.get("tools", {}).get("splice", {}) 6452 if not splice_config: 6453 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6454 msg_err = "No Splice tool config" 6455 raise ValueError(msg_err) 6456 log.debug(f"splice_config: {splice_config}") 6457 6458 # Config - Folders - Databases 6459 databases_folders = ( 6460 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6461 ) 6462 log.debug("Databases annotations: " + str(databases_folders)) 6463 6464 # Splice docker image 6465 splice_docker_image = splice_config.get("docker").get("image") 6466 6467 # Pull splice image if it's not already there 6468 if not check_docker_image_exists(splice_docker_image): 6469 log.warning( 6470 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6471 ) 6472 try: 6473 command(f"docker pull {splice_config.get('docker').get('image')}") 6474 except subprocess.CalledProcessError: 6475 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6476 log.error(msg_err) 6477 raise ValueError(msg_err) 6478 6479 # Config - splice databases 6480 splice_databases = ( 6481 config.get("folders", {}) 6482 .get("databases", {}) 6483 .get("splice", DEFAULT_SPLICE_FOLDER) 6484 ) 6485 splice_databases = full_path(splice_databases) 6486 6487 # Param 6488 param = self.get_param() 6489 log.debug("Param: " + str(param)) 6490 6491 # Param 6492 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6493 
log.debug("Options: " + str(options)) 6494 6495 # Data 6496 table_variants = self.get_table_variants() 6497 6498 # Check if not empty 6499 log.debug("Check if not empty") 6500 sql_query_chromosomes = ( 6501 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6502 ) 6503 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6504 log.info("VCF empty") 6505 return None 6506 6507 # Export in VCF 6508 log.debug("Create initial file to annotate") 6509 6510 # Create output folder / work folder 6511 if options.get("output_folder", ""): 6512 output_folder = options.get("output_folder", "") 6513 if not os.path.exists(output_folder): 6514 Path(output_folder).mkdir(parents=True, exist_ok=True) 6515 else: 6516 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 6520 if options.get("workdir", ""): 6521 workdir = options.get("workdir", "") 6522 else: 6523 workdir = "/work" 6524 6525 # Create tmp VCF file 6526 tmp_vcf = NamedTemporaryFile( 6527 prefix=self.get_prefix(), 6528 dir=output_folder, 6529 suffix=".vcf", 6530 delete=False, 6531 ) 6532 tmp_vcf_name = tmp_vcf.name 6533 6534 # VCF header 6535 header = self.get_header() 6536 6537 # Existing annotations 6538 for vcf_annotation in self.get_header().infos: 6539 6540 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6541 log.debug( 6542 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6543 ) 6544 6545 # Memory limit 6546 if config.get("memory", None): 6547 memory_limit = config.get("memory", "8G").upper() 6548 # upper() 6549 else: 6550 memory_limit = "8G" 6551 log.debug(f"memory_limit: {memory_limit}") 6552 6553 # Check number of variants to annotate 6554 where_clause_regex_spliceai = r"SpliceAI_\w+" 6555 where_clause_regex_spip = r"SPiP_\w+" 6556 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6557 df_list_of_variants_to_annotate = self.get_query_to_df( 6558 query=f""" SELECT * FROM variants {where_clause} """ 6559 ) 6560 if len(df_list_of_variants_to_annotate) == 0: 6561 log.warning( 6562 f"No variants to annotate with splice. Variants probably already annotated with splice" 6563 ) 6564 return None 6565 else: 6566 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6567 6568 # Export VCF file 6569 self.export_variant_vcf( 6570 vcf_file=tmp_vcf_name, 6571 remove_info=True, 6572 add_samples=True, 6573 index=False, 6574 where_clause=where_clause, 6575 ) 6576 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6577 if any(value for value in splice_config.values() if value is None): 6578 log.warning("At least one splice config parameter is empty") 6579 # exit annotation_splice 6580 return None 6581 6582 # Params in splice nf 6583 def check_values(dico: dict): 6584 """ 6585 Ensure parameters for NF splice pipeline 6586 """ 6587 for key, val in dico.items(): 6588 if key == "genome": 6589 if any( 6590 assemb in options.get("genome", {}) 6591 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6592 ): 6593 yield f"--{key} hg19" 6594 elif any( 6595 assemb in options.get("genome", {}) 6596 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6597 ): 6598 yield f"--{key} hg38" 6599 elif ( 6600 (isinstance(val, str) and val) 6601 or isinstance(val, int) 6602 or isinstance(val, bool) 6603 ): 6604 yield f"--{key} {val}" 6605 6606 # Genome 6607 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6608 options["genome"] = genome 6609 # NF params 6610 nf_params = [] 6611 # Add options 6612 if options: 6613 log.debug(options) 6614 nf_params = list(check_values(options)) 6615 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6616 else: 6617 log.debug("No NF params provided") 6618 # Add threads 6619 if "threads" not in 
options.keys(): 6620 nf_params.append(f"--threads {threads}") 6621 # Genome path 6622 genome_path = find_genome( 6623 config.get("folders", {}) 6624 .get("databases", {}) 6625 .get("genomes", DEFAULT_GENOME_FOLDER), 6626 file=f"{genome}.fa", 6627 ) 6628 # Add genome path 6629 if not genome_path: 6630 raise ValueError( 6631 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6632 ) 6633 else: 6634 log.debug(f"Genome: {genome_path}") 6635 nf_params.append(f"--genome_path {genome_path}") 6636 6637 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6638 """ 6639 Setting up updated databases for SPiP and SpliceAI 6640 """ 6641 6642 try: 6643 6644 # SpliceAI assembly transcriptome 6645 spliceai_assembly = os.path.join( 6646 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6647 options.get("genome"), 6648 "transcriptome", 6649 ) 6650 spip_assembly = options.get("genome") 6651 6652 spip = find( 6653 f"transcriptome_{spip_assembly}.RData", 6654 config.get("folders", {}).get("databases", {}).get("spip", {}), 6655 ) 6656 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6657 log.debug(f"SPiP annotations: {spip}") 6658 log.debug(f"SpliceAI annotations: {spliceai}") 6659 if spip and spliceai: 6660 return [ 6661 f"--spip_transcriptome {spip}", 6662 f"--spliceai_transcriptome {spliceai}", 6663 ] 6664 else: 6665 log.warning( 6666 "Can't find splice databases in configuration, use annotations file from image" 6667 ) 6668 except TypeError: 6669 log.warning( 6670 "Can't find splice databases in configuration, use annotations file from image" 6671 ) 6672 return [] 6673 6674 # Add options, check if transcriptome option have already beend provided 6675 if ( 6676 "spip_transcriptome" not in nf_params 6677 and "spliceai_transcriptome" not in nf_params 6678 ): 6679 splice_reference = splice_annotations(options, config) 6680 if splice_reference: 6681 
nf_params.extend(splice_reference) 6682 # nf_params.append(f"--output_folder {output_folder}") 6683 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6684 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6685 log.debug(cmd) 6686 splice_config["docker"]["command"] = cmd 6687 6688 # Ensure proxy is set 6689 proxy = [ 6690 f"-e {var}={os.getenv(var)}" 6691 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6692 if os.getenv(var) is not None 6693 ] 6694 docker_cmd = get_bin_command( 6695 tool="splice", 6696 bin_type="docker", 6697 config=config, 6698 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6699 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6700 ) 6701 # print(docker_cmd) 6702 # exit() 6703 # Docker debug 6704 # if splice_config.get("rm_container"): 6705 # rm_container = "--rm" 6706 # else: 6707 # rm_container = "" 6708 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6709 log.debug(docker_cmd) 6710 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6711 log.debug(res.stdout) 6712 if res.stderr: 6713 log.error(res.stderr) 6714 res.check_returncode() 6715 # Update variants 6716 log.info("Annotation - Updating...") 6717 # Test find output vcf 6718 log.debug( 6719 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6720 ) 6721 output_vcf = [] 6722 # Wrong folder to look in 6723 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6724 if ( 6725 files 6726 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6727 ): 6728 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6729 # log.debug(os.listdir(options.get("output_folder"))) 6730 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6731 if not output_vcf: 6732 log.debug( 6733 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6734 ) 6735 else: 6736 # Get new header from annotated vcf 6737 log.debug(f"Initial header: {len(header.infos)} fields") 6738 # Create new header with splice infos 6739 new_vcf = Variants(input=output_vcf[0]) 6740 new_vcf_header = new_vcf.get_header().infos 6741 for keys, infos in new_vcf_header.items(): 6742 if keys not in header.infos.keys(): 6743 header.infos[keys] = infos 6744 log.debug(f"New header: {len(header.infos)} fields") 6745 log.debug(f"Splice tmp output: {output_vcf[0]}") 6746 self.update_from_vcf(output_vcf[0]) 6747 6748 # Remove file 6749 remove_if_exists(output_vcf) 6750 6751 ### 6752 # Prioritization 6753 ### 6754 6755 def get_config_default(self, name: str) -> dict: 6756 """ 6757 The function `get_config_default` returns a dictionary containing default configurations for 6758 various calculations and prioritizations. 6759 6760 :param name: The `get_config_default` function returns a dictionary containing default 6761 configurations for different calculations and prioritizations. The `name` parameter is used to 6762 specify which specific configuration to retrieve from the dictionary 6763 :type name: str 6764 :return: The function `get_config_default` returns a dictionary containing default configuration 6765 settings for different calculations and prioritizations. The specific configuration settings are 6766 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6767 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6768 returned. If there is no match, an empty dictionary is returned. 
6769 """ 6770 6771 config_default = { 6772 "calculations": { 6773 "variant_chr_pos_alt_ref": { 6774 "type": "sql", 6775 "name": "variant_chr_pos_alt_ref", 6776 "description": "Create a variant ID with chromosome, position, alt and ref", 6777 "available": False, 6778 "output_column_name": "variant_chr_pos_alt_ref", 6779 "output_column_type": "String", 6780 "output_column_description": "variant ID with chromosome, position, alt and ref", 6781 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6782 "operation_info": True, 6783 }, 6784 "VARTYPE": { 6785 "type": "sql", 6786 "name": "VARTYPE", 6787 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6788 "available": True, 6789 "table": "variants", 6790 "output_column_name": "VARTYPE", 6791 "output_column_type": "String", 6792 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6793 "operation_query": """ 6794 CASE 6795 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6796 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6797 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6798 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6799 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6800 ELSE 'UNDEFINED' 6801 END 6802 """, 6803 "info_fields": ["SVTYPE"], 6804 "operation_info": True, 6805 }, 6806 "snpeff_hgvs": { 6807 "type": "python", 6808 "name": "snpeff_hgvs", 6809 "description": "HGVS nomenclatures from snpEff annotation", 6810 "available": True, 6811 "function_name": "calculation_extract_snpeff_hgvs", 6812 "function_params": ["snpeff_hgvs", "ANN"], 6813 }, 6814 "snpeff_ann_explode": { 6815 "type": "python", 6816 "name": "snpeff_ann_explode", 6817 "description": "Explode snpEff annotations with uniquify values", 6818 "available": True, 6819 "function_name": "calculation_snpeff_ann_explode", 6820 "function_params": [False, "fields", "snpeff_", "ANN"], 6821 }, 6822 "snpeff_ann_explode_uniquify": { 6823 "type": "python", 6824 
"name": "snpeff_ann_explode_uniquify", 6825 "description": "Explode snpEff annotations", 6826 "available": True, 6827 "function_name": "calculation_snpeff_ann_explode", 6828 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6829 }, 6830 "snpeff_ann_explode_json": { 6831 "type": "python", 6832 "name": "snpeff_ann_explode_json", 6833 "description": "Explode snpEff annotations in JSON format", 6834 "available": True, 6835 "function_name": "calculation_snpeff_ann_explode", 6836 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6837 }, 6838 "NOMEN": { 6839 "type": "python", 6840 "name": "NOMEN", 6841 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6842 "available": True, 6843 "function_name": "calculation_extract_nomen", 6844 "function_params": [], 6845 }, 6846 "RENAME_INFO_FIELDS": { 6847 "type": "python", 6848 "name": "RENAME_INFO_FIELDS", 6849 "description": "Rename or remove INFO/tags", 6850 "available": True, 6851 "function_name": "calculation_rename_info_fields", 6852 "function_params": [], 6853 }, 6854 "FINDBYPIPELINE": { 6855 "type": "python", 6856 "name": "FINDBYPIPELINE", 6857 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6858 "available": True, 6859 "function_name": "calculation_find_by_pipeline", 6860 "function_params": ["findbypipeline"], 6861 }, 6862 "FINDBYSAMPLE": { 6863 "type": "python", 6864 "name": "FINDBYSAMPLE", 6865 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6866 "available": True, 6867 "function_name": "calculation_find_by_pipeline", 6868 "function_params": ["findbysample"], 6869 }, 6870 "GENOTYPECONCORDANCE": { 6871 "type": "python", 6872 "name": "GENOTYPECONCORDANCE", 6873 "description": "Concordance of genotype for multi caller VCF", 6874 "available": True, 6875 "function_name": "calculation_genotype_concordance", 6876 "function_params": [], 6877 }, 6878 
"BARCODE": { 6879 "type": "python", 6880 "name": "BARCODE", 6881 "description": "BARCODE as VaRank tool", 6882 "available": True, 6883 "function_name": "calculation_barcode", 6884 "function_params": [], 6885 }, 6886 "BARCODEFAMILY": { 6887 "type": "python", 6888 "name": "BARCODEFAMILY", 6889 "description": "BARCODEFAMILY as VaRank tool", 6890 "available": True, 6891 "function_name": "calculation_barcode_family", 6892 "function_params": ["BCF"], 6893 }, 6894 "TRIO": { 6895 "type": "python", 6896 "name": "TRIO", 6897 "description": "Inheritance for a trio family", 6898 "available": True, 6899 "function_name": "calculation_trio", 6900 "function_params": [], 6901 }, 6902 "VAF": { 6903 "type": "python", 6904 "name": "VAF", 6905 "description": "Variant Allele Frequency (VAF) harmonization", 6906 "available": True, 6907 "function_name": "calculation_vaf_normalization", 6908 "function_params": [], 6909 }, 6910 "VAF_stats": { 6911 "type": "python", 6912 "name": "VAF_stats", 6913 "description": "Variant Allele Frequency (VAF) statistics", 6914 "available": True, 6915 "function_name": "calculation_genotype_stats", 6916 "function_params": ["VAF"], 6917 }, 6918 "DP_stats": { 6919 "type": "python", 6920 "name": "DP_stats", 6921 "description": "Depth (DP) statistics", 6922 "available": True, 6923 "function_name": "calculation_genotype_stats", 6924 "function_params": ["DP"], 6925 }, 6926 "variant_id": { 6927 "type": "python", 6928 "name": "variant_id", 6929 "description": "Variant ID generated from variant position and type", 6930 "available": True, 6931 "function_name": "calculation_variant_id", 6932 "function_params": [], 6933 }, 6934 "transcripts_json": { 6935 "type": "python", 6936 "name": "transcripts_json", 6937 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6938 "available": True, 6939 "function_name": "calculation_transcripts_annotation", 6940 "function_params": ["transcripts_json", None], 6941 }, 6942 "transcripts_ann": { 6943 
"type": "python", 6944 "name": "transcripts_ann", 6945 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6946 "available": True, 6947 "function_name": "calculation_transcripts_annotation", 6948 "function_params": [None, "transcripts_ann"], 6949 }, 6950 "transcripts_annotations": { 6951 "type": "python", 6952 "name": "transcripts_annotations", 6953 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6954 "available": True, 6955 "function_name": "calculation_transcripts_annotation", 6956 "function_params": [None, None], 6957 }, 6958 "transcripts_prioritization": { 6959 "type": "python", 6960 "name": "transcripts_prioritization", 6961 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6962 "available": True, 6963 "function_name": "calculation_transcripts_prioritization", 6964 "function_params": [], 6965 }, 6966 "transcripts_export": { 6967 "type": "python", 6968 "name": "transcripts_export", 6969 "description": "Export transcripts table/view as a file (using param.json)", 6970 "available": True, 6971 "function_name": "calculation_transcripts_export", 6972 "function_params": [], 6973 }, 6974 }, 6975 "prioritizations": { 6976 "default": { 6977 "ANN2": [ 6978 { 6979 "type": "contains", 6980 "value": "HIGH", 6981 "score": 5, 6982 "flag": "PASS", 6983 "comment": [ 6984 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6985 ], 6986 }, 6987 { 6988 "type": "contains", 6989 "value": "MODERATE", 6990 "score": 3, 6991 "flag": "PASS", 6992 "comment": [ 6993 "A non-disruptive variant that might change protein effectiveness" 6994 ], 6995 }, 6996 { 6997 "type": "contains", 6998 "value": "LOW", 6999 "score": 0, 7000 "flag": "FILTERED", 7001 "comment": [ 7002 "Assumed to be mostly harmless or unlikely to change protein behavior" 7003 ], 7004 
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        return config_default.get(name, None)

    def get_config_json(
        self, name: str, config_dict: dict = {}, config_file: str = None
    ) -> dict:
        """
        Build the configuration dict for `name` by layering three sources, in
        increasing order of precedence: the built-in defaults returned by
        `get_config_default`, the entries of `config_dict`, and the top-level
        entries of `config_file` (parsed with `yaml.safe_load`).

        :param name: name of the configuration section (e.g. "prioritizations")
        :type name: str
        :param config_dict: entries that override the defaults, key by key
        :type config_dict: dict
        :param config_file: path (expanded via `full_path`) to a YAML file whose
            top-level entries override both the defaults and `config_dict`
        :type config_file: str
        :return: the merged configuration dict
        :raises ValueError: if `config_file` is provided but does not exist
        """

        # Start from the built-in defaults for this configuration name
        # (NOTE(review): for an unknown name this is None — overrides below
        # would then raise; presumably callers only pass known names)
        config_default = self.get_config_default(name=name)
        configuration = config_default
        # log.debug(f"configuration={configuration}")

        # Override defaults with entries from the provided dict
        for config in config_dict:
            configuration[config] = config_dict[config]

        # Override with entries from the provided file (highest precedence)
        config_file = full_path(config_file)
        if config_file:
            if os.path.exists(config_file):
                with open(config_file) as config_file_content:
                    config_file_dict = yaml.safe_load(config_file_content)
                    for config in config_file_dict:
                        configuration[config] = config_file_dict[config]
            else:
                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
                log.error(msg_error)
                raise ValueError(msg_error)

        return configuration

    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        Prioritize variants by applying the criteria of configured profiles as
        SQL UPDATE statements on per-profile PZ columns (Score/Flag/Class/
        Comment/Infos), then folding the resulting values back into the INFO
        field of the variants table.

        :param table: table to prioritize; defaults to the variants table
            (`get_table_variants(clause="update")`)
        :type table: str
        :param pz_prefix: prefix of the PZ INFO fields; defaults to the
            "pzprefix" parameter, or "PZ"
        :type pz_prefix: str
        :param pz_param: prioritization parameters; defaults to the
            "prioritization" section of the instance parameters
        :type pz_param: dict
        :return: True when prioritization ran, False when no profile is defined
        :raises ValueError: if a requested profile is not configured, if an
            annotation field is missing from the data, or if a criterion has
            neither an 'operation' nor a 'sql' mode
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: explicit argument wins over instance param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with optional config file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings are split to lists)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles from the top-level param
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f" {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast on any requested profile missing from the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default if none was set
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Columns added during processing, dropped again at the end
        added_columns = []

        # Create list of PZfields: base fields plus one per (field, profile)
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists (only fields absent from
        # the VCF header are computed here)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF INFO header metadata per field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (un-suffixed fields carry the
            # default profile in their description)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        # SQL expressions turning PZ columns into INFO entries
                        sql_set_info = []

                        # PZ fields set

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZClass
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )

                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Class" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )
                                """
                            )

                        # PZComment
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields into one ';'-separated concat chain
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: 'operation' (type/value
                                # comparison) or 'sql' (free WHERE clause)
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters (quotes/semicolons/tabs
                                # sanitized so values embed safely in SQL/INFO)
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL clause given as a list is joined
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields and explode
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                for annotation_field in criterion_fields:

                                    # Explode specific annotation
                                    log.debug(
                                        f"Explode annotation '{annotation_field}'"
                                    )
                                    added_columns += self.explode_infos(
                                        prefix=explode_infos_prefix,
                                        fields=[annotation_field],
                                        table=table_variants,
                                    )
                                    extra_infos = self.get_extra_infos(
                                        table=table_variants
                                    )

                                    # Check if annotation field is present
                                    if (
                                        f"{explode_infos_prefix}{annotation_field}"
                                        not in extra_infos
                                    ):
                                        msq_err = f"Annotation '{annotation_field}' not in data"
                                        log.error(msq_err)
                                        raise ValueError(msq_err)
                                    else:
                                        log.debug(
                                            f"Annotation '{annotation_field}' in data"
                                        )

                                    # Per-criterion SET clauses
                                    sql_set = []
                                    sql_set_info = []

                                    # PZ fields set

                                    # PZScore
                                    if (
                                        f"{pz_prefix}Score{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        # VaRank prioritization score mode
                                        # (keep the maximum criterion score)
                                        if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
                                            sql_set.append(
                                                f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
                                            )
                                        # default HOWARD prioritization score
                                        # mode (sum of criterion scores)
                                        else:
                                            sql_set.append(
                                                f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                            )

                                    # PZFlag
                                    if (
                                        f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                        )

                                    # PZClass
                                    if (
                                        f"{pz_prefix}Class{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                        and criterion_class is not None
                                    ):
                                        sql_set.append(
                                            f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                        )

                                    # PZComment
                                    if (
                                        f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"""
                                            {pz_prefix}Comment{pzfields_sep}{profile} =
                                                concat(
                                                    {pz_prefix}Comment{pzfields_sep}{profile},
                                                    CASE
                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                        THEN ', '
                                                        ELSE ''
                                                    END,
                                                    '{criterion_comment}'
                                                )
                                            """
                                        )

                                    # PZInfos
                                    if (
                                        f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"""
                                            {pz_prefix}Infos{pzfields_sep}{profile} =
                                                concat(
                                                    {pz_prefix}Infos{pzfields_sep}{profile},
                                                    '{criterion_infos}'
                                                )
                                            """
                                        )
                                    sql_set_option = ",".join(sql_set)

                                    # Criterion and comparison
                                    if sql_set_option:

                                        if criterion_mode in ["operation"]:

                                            # Numeric comparison when the
                                            # criterion value parses as a
                                            # number; otherwise fall back to
                                            # string/regex matching
                                            try:
                                                float(criterion_value)
                                                sql_update = f"""
                                                    UPDATE {table_variants}
                                                    SET {sql_set_option}
                                                    WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                    AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                                """
                                            except:
                                                contains_option = ""
                                                if criterion_type == "contains":
                                                    contains_option = ".*"
                                                sql_update = f"""
                                                    UPDATE {table_variants}
                                                    SET {sql_set_option}
                                                    WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                                """
                                            sql_queries.append(sql_update)

                                        elif criterion_mode in ["sql"]:

                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE {criterion_sql}
                                            """
                                            sql_queries.append(sql_update)

                                        else:
                                            msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                            log.error(msg_err)
                                            raise ValueError(msg_err)

                                    else:
                                        log.warning(
                                            f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                        )

                        # PZTags
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZTags value (field#value pairs)
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                    THEN ';'
                                    ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Fold computed PZ columns back into INFO
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True

    ###
    # HGVS
    ###

    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
7834 """ 7835 return partition.apply(annotation_hgvs_partition, axis=1) 7836 7837 def annotation_hgvs_partition(row) -> str: 7838 """ 7839 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7840 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7841 7842 :param row: A dictionary-like object that contains the values for the following keys: 7843 :return: a string that contains the HGVS names associated with the given row of data. 7844 """ 7845 7846 chr = row["CHROM"] 7847 pos = row["POS"] 7848 ref = row["REF"] 7849 alt = row["ALT"] 7850 7851 # Find list of associated transcripts 7852 transcripts_list = list( 7853 polars_conn.execute( 7854 f""" 7855 SELECT transcript 7856 FROM refseq_df 7857 WHERE CHROM='{chr}' 7858 AND POS={pos} 7859 """ 7860 )["transcript"] 7861 ) 7862 7863 # Full HGVS annotation in list 7864 hgvs_full_list = [] 7865 7866 for transcript_name in transcripts_list: 7867 7868 # Transcript 7869 transcript = get_transcript( 7870 transcripts=transcripts, transcript_name=transcript_name 7871 ) 7872 # Exon 7873 if use_exon: 7874 exon = transcript.find_exon_number(pos) 7875 else: 7876 exon = None 7877 # Protein 7878 transcript_protein = None 7879 if use_protein or add_protein or full_format: 7880 transcripts_protein = list( 7881 polars_conn.execute( 7882 f""" 7883 SELECT protein 7884 FROM refseqlink_df 7885 WHERE transcript='{transcript_name}' 7886 LIMIT 1 7887 """ 7888 )["protein"] 7889 ) 7890 if len(transcripts_protein): 7891 transcript_protein = transcripts_protein[0] 7892 7893 # HGVS name 7894 hgvs_name = format_hgvs_name( 7895 chr, 7896 pos, 7897 ref, 7898 alt, 7899 genome=genome, 7900 transcript=transcript, 7901 transcript_protein=transcript_protein, 7902 exon=exon, 7903 use_gene=use_gene, 7904 use_protein=use_protein, 7905 full_format=full_format, 7906 use_version=use_version, 7907 codon_type=codon_type, 7908 ) 7909 hgvs_full_list.append(hgvs_name) 7910 if add_protein and not 
use_protein and not full_format: 7911 hgvs_name = format_hgvs_name( 7912 chr, 7913 pos, 7914 ref, 7915 alt, 7916 genome=genome, 7917 transcript=transcript, 7918 transcript_protein=transcript_protein, 7919 exon=exon, 7920 use_gene=use_gene, 7921 use_protein=True, 7922 full_format=False, 7923 use_version=use_version, 7924 codon_type=codon_type, 7925 ) 7926 hgvs_full_list.append(hgvs_name) 7927 7928 # Create liste of HGVS annotations 7929 hgvs_full = ",".join(hgvs_full_list) 7930 7931 return hgvs_full 7932 7933 # Polars connexion 7934 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7935 7936 # Config 7937 config = self.get_config() 7938 7939 # Databases 7940 # Genome 7941 databases_genomes_folders = ( 7942 config.get("folders", {}) 7943 .get("databases", {}) 7944 .get("genomes", DEFAULT_GENOME_FOLDER) 7945 ) 7946 databases_genome = ( 7947 config.get("folders", {}).get("databases", {}).get("genomes", "") 7948 ) 7949 # refseq database folder 7950 databases_refseq_folders = ( 7951 config.get("folders", {}) 7952 .get("databases", {}) 7953 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7954 ) 7955 # refseq 7956 databases_refseq = config.get("databases", {}).get("refSeq", None) 7957 # refSeqLink 7958 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7959 7960 # Param 7961 param = self.get_param() 7962 7963 # Quick HGVS 7964 if "hgvs_options" in param and param.get("hgvs_options", ""): 7965 log.info(f"Quick HGVS Annotation:") 7966 if not param.get("hgvs", None): 7967 param["hgvs"] = {} 7968 for option in param.get("hgvs_options", "").split(","): 7969 option_var_val = option.split("=") 7970 option_var = option_var_val[0] 7971 if len(option_var_val) > 1: 7972 option_val = option_var_val[1] 7973 else: 7974 option_val = "True" 7975 if option_val.upper() in ["TRUE"]: 7976 option_val = True 7977 elif option_val.upper() in ["FALSE"]: 7978 option_val = False 7979 log.info(f" {option_var}={option_val}") 7980 param["hgvs"][option_var] = option_val 7981 
7982 # Check if HGVS annotation enabled 7983 if "hgvs" in param: 7984 log.info(f"HGVS Annotation... ") 7985 for hgvs_option in param.get("hgvs", {}): 7986 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7987 else: 7988 return 7989 7990 # HGVS Param 7991 param_hgvs = param.get("hgvs", {}) 7992 use_exon = param_hgvs.get("use_exon", False) 7993 use_gene = param_hgvs.get("use_gene", False) 7994 use_protein = param_hgvs.get("use_protein", False) 7995 add_protein = param_hgvs.get("add_protein", False) 7996 full_format = param_hgvs.get("full_format", False) 7997 use_version = param_hgvs.get("use_version", False) 7998 codon_type = param_hgvs.get("codon_type", "3") 7999 8000 # refSseq refSeqLink 8001 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8002 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8003 8004 # Assembly 8005 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8006 8007 # Genome 8008 genome_file = None 8009 if find_genome(databases_genome): 8010 genome_file = find_genome(databases_genome) 8011 else: 8012 genome_file = find_genome( 8013 genome_path=databases_genomes_folders, assembly=assembly 8014 ) 8015 log.debug("Genome: " + str(genome_file)) 8016 8017 # refSseq 8018 refseq_file = find_file_prefix( 8019 input_file=databases_refseq, 8020 prefix="ncbiRefSeq", 8021 folder=databases_refseq_folders, 8022 assembly=assembly, 8023 ) 8024 log.debug("refSeq: " + str(refseq_file)) 8025 8026 # refSeqLink 8027 refseqlink_file = find_file_prefix( 8028 input_file=databases_refseqlink, 8029 prefix="ncbiRefSeqLink", 8030 folder=databases_refseq_folders, 8031 assembly=assembly, 8032 ) 8033 log.debug("refSeqLink: " + str(refseqlink_file)) 8034 8035 # Threads 8036 if not threads: 8037 threads = self.get_threads() 8038 log.debug("Threads: " + str(threads)) 8039 8040 # Variables 8041 table_variants = self.get_table_variants(clause="update") 8042 8043 # Get variants SNV and InDel only 8044 
query_variants = f""" 8045 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8046 FROM {table_variants} 8047 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8048 """ 8049 df_variants = self.get_query_to_df(query_variants) 8050 8051 # Added columns 8052 added_columns = [] 8053 8054 # Add hgvs column in variants table 8055 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8056 added_column = self.add_column( 8057 table_variants, hgvs_column_name, "STRING", default_value=None 8058 ) 8059 added_columns.append(added_column) 8060 8061 log.debug(f"refSeq loading...") 8062 # refSeq in duckDB 8063 refseq_table = get_refseq_table( 8064 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8065 ) 8066 # Loading all refSeq in Dataframe 8067 refseq_query = f""" 8068 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8069 FROM {refseq_table} 8070 JOIN df_variants ON ( 8071 {refseq_table}.chrom = df_variants.CHROM 8072 AND {refseq_table}.txStart<=df_variants.POS 8073 AND {refseq_table}.txEnd>=df_variants.POS 8074 ) 8075 """ 8076 refseq_df = self.conn.query(refseq_query).pl() 8077 8078 if refseqlink_file: 8079 log.debug(f"refSeqLink loading...") 8080 # refSeqLink in duckDB 8081 refseqlink_table = get_refseq_table( 8082 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8083 ) 8084 # Loading all refSeqLink in Dataframe 8085 protacc_column = "protAcc_with_ver" 8086 mrnaacc_column = "mrnaAcc_with_ver" 8087 refseqlink_query = f""" 8088 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8089 FROM {refseqlink_table} 8090 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8091 WHERE protAcc_without_ver IS NOT NULL 8092 """ 8093 # Polars Dataframe 8094 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8095 8096 # Read RefSeq transcripts into a python dict/model. 
8097 log.debug(f"Transcripts loading...") 8098 with tempfile.TemporaryDirectory() as tmpdir: 8099 transcripts_query = f""" 8100 COPY ( 8101 SELECT {refseq_table}.* 8102 FROM {refseq_table} 8103 JOIN df_variants ON ( 8104 {refseq_table}.chrom=df_variants.CHROM 8105 AND {refseq_table}.txStart<=df_variants.POS 8106 AND {refseq_table}.txEnd>=df_variants.POS 8107 ) 8108 ) 8109 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8110 """ 8111 self.conn.query(transcripts_query) 8112 with open(f"{tmpdir}/transcript.tsv") as infile: 8113 transcripts = read_transcripts(infile) 8114 8115 # Polars connexion 8116 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8117 8118 log.debug("Genome loading...") 8119 # Read genome sequence using pyfaidx. 8120 genome = Fasta(genome_file) 8121 8122 log.debug("Start annotation HGVS...") 8123 8124 # Create 8125 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8126 ddf = dd.from_pandas(df_variants, npartitions=threads) 8127 8128 # Use dask.dataframe.apply() to apply function on each partition 8129 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8130 8131 # Convert Dask DataFrame to Pandas Dataframe 8132 df = ddf.compute() 8133 8134 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8135 with tempfile.TemporaryDirectory() as tmpdir: 8136 df_parquet = os.path.join(tmpdir, "df.parquet") 8137 df.to_parquet(df_parquet) 8138 8139 # Update hgvs column 8140 update_variant_query = f""" 8141 UPDATE {table_variants} 8142 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8143 FROM read_parquet('{df_parquet}') as df 8144 WHERE variants."#CHROM" = df.CHROM 8145 AND variants.POS = df.POS 8146 AND variants.REF = df.REF 8147 AND variants.ALT = df.ALT 8148 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8149 """ 8150 self.execute_query(update_variant_query) 8151 8152 # Update INFO column 8153 sql_query_update = f""" 8154 UPDATE {table_variants} 8155 SET INFO = 8156 concat( 8157 CASE 8158 WHEN INFO NOT IN ('','.') 8159 THEN concat(INFO, ';') 8160 ELSE '' 8161 END, 8162 'hgvs=', 8163 {hgvs_column_name} 8164 ) 8165 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8166 """ 8167 self.execute_query(sql_query_update) 8168 8169 # Add header 8170 HGVS_INFOS = { 8171 "hgvs": { 8172 "ID": "hgvs", 8173 "Number": ".", 8174 "Type": "String", 8175 "Description": f"HGVS annotatation with HOWARD", 8176 } 8177 } 8178 8179 for field in HGVS_INFOS: 8180 field_ID = HGVS_INFOS[field]["ID"] 8181 field_description = HGVS_INFOS[field]["Description"] 8182 self.get_header().infos[field_ID] = vcf.parser._Info( 8183 field_ID, 8184 HGVS_INFOS[field]["Number"], 8185 HGVS_INFOS[field]["Type"], 8186 field_description, 8187 "unknown", 8188 "unknown", 8189 code_type_map[HGVS_INFOS[field]["Type"]], 8190 ) 8191 8192 # Remove added columns 8193 for added_column in added_columns: 8194 self.drop_column(column=added_column) 8195 8196 ### 8197 # Calculation 8198 ### 8199 8200 def get_operations_help( 8201 self, operations_config_dict: dict = {}, operations_config_file: str = None 8202 ) -> list: 8203 8204 # Init 8205 operations_help = [] 8206 8207 # operations 8208 operations = self.get_config_json( 8209 name="calculations", 8210 
config_dict=operations_config_dict, 8211 config_file=operations_config_file, 8212 ) 8213 for op in operations: 8214 op_name = operations[op].get("name", op).upper() 8215 op_description = operations[op].get("description", op_name) 8216 op_available = operations[op].get("available", False) 8217 if op_available: 8218 operations_help.append(f" {op_name}: {op_description}") 8219 8220 # Sort operations 8221 operations_help.sort() 8222 8223 # insert header 8224 operations_help.insert(0, "Available calculation operations:") 8225 8226 # Return 8227 return operations_help 8228 8229 def calculation( 8230 self, 8231 operations: dict = {}, 8232 operations_config_dict: dict = {}, 8233 operations_config_file: str = None, 8234 ) -> None: 8235 """ 8236 It takes a list of operations, and for each operation, it checks if it's a python or sql 8237 operation, and then calls the appropriate function 8238 8239 param json example: 8240 "calculation": { 8241 "NOMEN": { 8242 "options": { 8243 "hgvs_field": "hgvs" 8244 }, 8245 "middle" : null 8246 } 8247 """ 8248 8249 # Param 8250 param = self.get_param() 8251 8252 # CHeck operations config file 8253 if operations_config_file is None: 8254 operations_config_file = param.get("calculation", {}).get( 8255 "calculation_config", None 8256 ) 8257 8258 # operations config 8259 operations_config = self.get_config_json( 8260 name="calculations", 8261 config_dict=operations_config_dict, 8262 config_file=operations_config_file, 8263 ) 8264 8265 # Upper keys 8266 operations_config = {k.upper(): v for k, v in operations_config.items()} 8267 8268 # Calculations 8269 8270 # Operations from param 8271 operations = param.get("calculation", {}).get("calculations", operations) 8272 8273 # Quick calculation - add 8274 if param.get("calculations", None): 8275 8276 # List of operations 8277 calculations_list = [ 8278 value.strip() for value in param.get("calculations", "").split(",") 8279 ] 8280 8281 # Log 8282 log.info(f"Quick Calculations:") 8283 for 
calculation_key in calculations_list: 8284 log.info(f" {calculation_key}") 8285 8286 # Create tmp operations (to keep operation order) 8287 operations_tmp = {} 8288 for calculation_operation in calculations_list: 8289 if calculation_operation.upper() not in operations_tmp: 8290 log.debug( 8291 f"{calculation_operation}.upper() not in {operations_tmp}" 8292 ) 8293 operations_tmp[calculation_operation.upper()] = {} 8294 add_value_into_dict( 8295 dict_tree=operations_tmp, 8296 sections=[ 8297 calculation_operation.upper(), 8298 ], 8299 value=operations.get(calculation_operation.upper(), {}), 8300 ) 8301 # Add operations already in param 8302 for calculation_operation in operations: 8303 if calculation_operation not in operations_tmp: 8304 operations_tmp[calculation_operation] = operations.get( 8305 calculation_operation, {} 8306 ) 8307 8308 # Update operations in param 8309 operations = operations_tmp 8310 8311 # Operations for calculation 8312 if not operations: 8313 operations = param.get("calculation", {}).get("calculations", {}) 8314 8315 if operations: 8316 log.info(f"Calculations...") 8317 8318 # For each operations 8319 for operation_name in operations: 8320 operation_name = operation_name.upper() 8321 if operation_name not in [""]: 8322 if operation_name in operations_config: 8323 log.info(f"Calculation '{operation_name}'") 8324 operation = operations_config[operation_name] 8325 operation_type = operation.get("type", "sql") 8326 if operation_type == "python": 8327 self.calculation_process_function( 8328 operation=operation, operation_name=operation_name 8329 ) 8330 elif operation_type == "sql": 8331 self.calculation_process_sql( 8332 operation=operation, operation_name=operation_name 8333 ) 8334 else: 8335 log.error( 8336 f"Operations config: Type '{operation_type}' NOT available" 8337 ) 8338 raise ValueError( 8339 f"Operations config: Type '{operation_type}' NOT available" 8340 ) 8341 else: 8342 log.error( 8343 f"Operations config: Calculation 
'{operation_name}' NOT available" 8344 ) 8345 raise ValueError( 8346 f"Operations config: Calculation '{operation_name}' NOT available" 8347 ) 8348 8349 # Explode INFOS fields into table fields 8350 if self.get_explode_infos(): 8351 self.explode_infos( 8352 prefix=self.get_explode_infos_prefix(), 8353 fields=self.get_explode_infos_fields(), 8354 force=True, 8355 ) 8356 8357 def calculation_process_sql( 8358 self, operation: dict, operation_name: str = "unknown" 8359 ) -> None: 8360 """ 8361 The `calculation_process_sql` function takes in a mathematical operation as a string and 8362 performs the operation, updating the specified table with the result. 8363 8364 :param operation: The `operation` parameter is a dictionary that contains information about the 8365 mathematical operation to be performed. It includes the following keys: 8366 :type operation: dict 8367 :param operation_name: The `operation_name` parameter is a string that represents the name of 8368 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8369 defaults to unknown 8370 :type operation_name: str (optional) 8371 """ 8372 8373 # Operation infos 8374 operation_name = operation.get("name", "unknown") 8375 log.debug(f"process SQL {operation_name}") 8376 output_column_name = operation.get("output_column_name", operation_name) 8377 output_column_type = operation.get("output_column_type", "String") 8378 prefix = operation.get("explode_infos_prefix", "") 8379 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8380 output_column_description = operation.get( 8381 "output_column_description", f"{operation_name} operation" 8382 ) 8383 operation_query = operation.get("operation_query", None) 8384 if isinstance(operation_query, list): 8385 operation_query = " ".join(operation_query) 8386 operation_info_fields = operation.get("info_fields", []) 8387 operation_info_fields_check = operation.get("info_fields_check", False) 8388 operation_info = operation.get("operation_info", True) 8389 operation_table = operation.get( 8390 "table", self.get_table_variants(clause="alter") 8391 ) 8392 8393 # table variants 8394 if operation_table: 8395 table_variants = operation_table 8396 else: 8397 table_variants = self.get_table_variants(clause="alter") 8398 8399 if operation_query: 8400 8401 # Info fields check 8402 operation_info_fields_check_result = True 8403 if operation_info_fields_check: 8404 header_infos = self.get_header().infos 8405 for info_field in operation_info_fields: 8406 operation_info_fields_check_result = ( 8407 operation_info_fields_check_result 8408 and info_field in header_infos 8409 ) 8410 8411 # If info fields available 8412 if operation_info_fields_check_result: 8413 8414 # Added_columns 8415 added_columns = [] 8416 8417 # Create VCF header field 8418 vcf_reader = self.get_header() 8419 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8420 output_column_name, 8421 ".", 8422 output_column_type, 8423 
output_column_description, 8424 "howard calculation", 8425 "0", 8426 self.code_type_map.get(output_column_type), 8427 ) 8428 8429 # Explode infos if needed 8430 log.debug(f"calculation_process_sql prefix {prefix}") 8431 added_columns += self.explode_infos( 8432 prefix=prefix, 8433 fields=[output_column_name] + operation_info_fields, 8434 force=False, 8435 table=table_variants, 8436 ) 8437 8438 # Create column 8439 added_column = self.add_column( 8440 table_name=table_variants, 8441 column_name=prefix + output_column_name, 8442 column_type=output_column_type_sql, 8443 default_value="null", 8444 ) 8445 added_columns.append(added_column) 8446 8447 # Operation calculation 8448 try: 8449 8450 # Query to update calculation column 8451 sql_update = f""" 8452 UPDATE {table_variants} 8453 SET "{prefix}{output_column_name}" = ({operation_query}) 8454 """ 8455 self.conn.execute(sql_update) 8456 8457 # Add to INFO 8458 if operation_info: 8459 sql_update_info = f""" 8460 UPDATE {table_variants} 8461 SET "INFO" = 8462 concat( 8463 CASE 8464 WHEN "INFO" IS NOT NULL 8465 THEN concat("INFO", ';') 8466 ELSE '' 8467 END, 8468 '{output_column_name}=', 8469 "{prefix}{output_column_name}" 8470 ) 8471 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8472 """ 8473 self.conn.execute(sql_update_info) 8474 8475 except: 8476 log.error( 8477 f"Operations config: Calculation '{operation_name}' query failed" 8478 ) 8479 raise ValueError( 8480 f"Operations config: Calculation '{operation_name}' query failed" 8481 ) 8482 8483 # Remove added columns 8484 for added_column in added_columns: 8485 log.debug(f"added_column: {added_column}") 8486 self.drop_column(column=added_column) 8487 8488 else: 8489 log.error( 8490 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8491 ) 8492 raise ValueError( 8493 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8494 ) 8495 8496 else: 8497 log.error( 8498 f"Operations config: Calculation '{operation_name}' query NOT defined" 8499 ) 8500 raise ValueError( 8501 f"Operations config: Calculation '{operation_name}' query NOT defined" 8502 ) 8503 8504 def calculation_process_function( 8505 self, operation: dict, operation_name: str = "unknown" 8506 ) -> None: 8507 """ 8508 The `calculation_process_function` takes in an operation dictionary and performs the specified 8509 function with the given parameters. 8510 8511 :param operation: The `operation` parameter is a dictionary that contains information about the 8512 operation to be performed. It has the following keys: 8513 :type operation: dict 8514 :param operation_name: The `operation_name` parameter is a string that represents the name of 8515 the operation being performed. It is used for logging purposes, defaults to unknown 8516 :type operation_name: str (optional) 8517 """ 8518 8519 operation_name = operation["name"] 8520 log.debug(f"process Python {operation_name}") 8521 function_name = operation["function_name"] 8522 function_params = operation["function_params"] 8523 getattr(self, function_name)(*function_params) 8524 8525 def calculation_variant_id(self) -> None: 8526 """ 8527 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8528 updates the INFO field of a variants table with the variant ID. 
8529 """ 8530 8531 # variant_id annotation field 8532 variant_id_tag = self.get_variant_id_column() 8533 added_columns = [variant_id_tag] 8534 8535 # variant_id hgvs tags" 8536 vcf_infos_tags = { 8537 variant_id_tag: "howard variant ID annotation", 8538 } 8539 8540 # Variants table 8541 table_variants = self.get_table_variants() 8542 8543 # Header 8544 vcf_reader = self.get_header() 8545 8546 # Add variant_id to header 8547 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8548 variant_id_tag, 8549 ".", 8550 "String", 8551 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8552 "howard calculation", 8553 "0", 8554 self.code_type_map.get("String"), 8555 ) 8556 8557 # Update 8558 sql_update = f""" 8559 UPDATE {table_variants} 8560 SET "INFO" = 8561 concat( 8562 CASE 8563 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8564 THEN '' 8565 ELSE concat("INFO", ';') 8566 END, 8567 '{variant_id_tag}=', 8568 "{variant_id_tag}" 8569 ) 8570 """ 8571 self.conn.execute(sql_update) 8572 8573 # Remove added columns 8574 for added_column in added_columns: 8575 self.drop_column(column=added_column) 8576 8577 def calculation_extract_snpeff_hgvs( 8578 self, 8579 snpeff_hgvs: str = "snpeff_hgvs", 8580 snpeff_field: str = "ANN", 8581 ) -> None: 8582 """ 8583 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8584 annotation field in a VCF file and adds them as a new column in the variants table. 8585 8586 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8587 function is used to specify the name of the column that will store the HGVS nomenclatures 8588 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8589 snpeff_hgvs 8590 :type snpeff_hgvs: str (optional) 8591 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8592 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8593 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8594 to ANN 8595 :type snpeff_field: str (optional) 8596 """ 8597 8598 # Snpeff hgvs tags 8599 vcf_infos_tags = { 8600 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8601 } 8602 8603 # Prefix 8604 prefix = self.get_explode_infos_prefix() 8605 if prefix: 8606 prefix = "INFO/" 8607 8608 # snpEff fields 8609 speff_ann_infos = prefix + snpeff_field 8610 speff_hgvs_infos = prefix + snpeff_hgvs 8611 8612 # Variants table 8613 table_variants = self.get_table_variants() 8614 8615 # Header 8616 vcf_reader = self.get_header() 8617 8618 # Add columns 8619 added_columns = [] 8620 8621 # Explode HGVS field in column 8622 added_columns += self.explode_infos(fields=[snpeff_field]) 8623 8624 if snpeff_field in vcf_reader.infos: 8625 8626 log.debug(vcf_reader.infos[snpeff_field]) 8627 8628 # Extract ANN header 8629 ann_description = vcf_reader.infos[snpeff_field].desc 8630 pattern = r"'(.+?)'" 8631 match = re.search(pattern, ann_description) 8632 if match: 8633 ann_header_match = match.group(1).split(" | ") 8634 ann_header_desc = {} 8635 for i in range(len(ann_header_match)): 8636 ann_header_info = "".join( 8637 char for char in ann_header_match[i] if char.isalnum() 8638 ) 8639 ann_header_desc[ann_header_info] = ann_header_match[i] 8640 if not ann_header_desc: 8641 raise ValueError("Invalid header description format") 8642 else: 8643 raise ValueError("Invalid header description format") 8644 8645 # Create variant id 8646 variant_id_column = self.get_variant_id_column() 8647 added_columns += [variant_id_column] 8648 8649 # Create dataframe 8650 dataframe_snpeff_hgvs = self.get_query_to_df( 8651 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8652 ) 8653 8654 # Create main NOMEN column 8655 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8656 speff_ann_infos 8657 ].apply( 8658 lambda x: extract_snpeff_hgvs( 
8659 str(x), header=list(ann_header_desc.values()) 8660 ) 8661 ) 8662 8663 # Add snpeff_hgvs to header 8664 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8665 snpeff_hgvs, 8666 ".", 8667 "String", 8668 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8669 "howard calculation", 8670 "0", 8671 self.code_type_map.get("String"), 8672 ) 8673 8674 # Update 8675 sql_update = f""" 8676 UPDATE variants 8677 SET "INFO" = 8678 concat( 8679 CASE 8680 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8681 THEN '' 8682 ELSE concat("INFO", ';') 8683 END, 8684 CASE 8685 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8686 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8687 THEN concat( 8688 '{snpeff_hgvs}=', 8689 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8690 ) 8691 ELSE '' 8692 END 8693 ) 8694 FROM dataframe_snpeff_hgvs 8695 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8696 8697 """ 8698 self.conn.execute(sql_update) 8699 8700 # Delete dataframe 8701 del dataframe_snpeff_hgvs 8702 gc.collect() 8703 8704 else: 8705 8706 log.warning( 8707 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8708 ) 8709 8710 # Remove added columns 8711 for added_column in added_columns: 8712 self.drop_column(column=added_column) 8713 8714 def calculation_snpeff_ann_explode( 8715 self, 8716 uniquify: bool = True, 8717 output_format: str = "fields", 8718 output_prefix: str = "snpeff_", 8719 snpeff_field: str = "ANN", 8720 ) -> None: 8721 """ 8722 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8723 exploding the HGVS field and updating variant information accordingly. 8724 8725 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8726 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8727 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8728 defaults to True 8729 :type uniquify: bool (optional) 8730 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8731 function specifies the format in which the output annotations will be generated. It has a 8732 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8733 format, defaults to fields 8734 :type output_format: str (optional) 8735 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8736 method is used to specify the prefix that will be added to the output annotations generated 8737 during the calculation process. This prefix helps to differentiate the newly added annotations 8738 from existing ones in the output data. By default, the, defaults to ANN_ 8739 :type output_prefix: str (optional) 8740 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8741 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8742 field will be processed to explode the HGVS annotations and update the variant information 8743 accordingly, defaults to ANN 8744 :type snpeff_field: str (optional) 8745 """ 8746 8747 # SnpEff annotation field 8748 snpeff_hgvs = "snpeff_ann_explode" 8749 8750 # Snpeff hgvs tags 8751 vcf_infos_tags = { 8752 snpeff_hgvs: "Explode snpEff annotations", 8753 } 8754 8755 # Prefix 8756 prefix = self.get_explode_infos_prefix() 8757 if prefix: 8758 prefix = "INFO/" 8759 8760 # snpEff fields 8761 speff_ann_infos = prefix + snpeff_field 8762 speff_hgvs_infos = prefix + snpeff_hgvs 8763 8764 # Variants table 8765 table_variants = self.get_table_variants() 8766 8767 # Header 8768 vcf_reader = self.get_header() 8769 8770 # Add columns 8771 added_columns = [] 8772 8773 # Explode HGVS field in column 8774 added_columns += self.explode_infos(fields=[snpeff_field]) 8775 log.debug(f"snpeff_field={snpeff_field}") 8776 log.debug(f"added_columns={added_columns}") 8777 8778 if snpeff_field in vcf_reader.infos: 8779 8780 # Extract ANN header 8781 ann_description = vcf_reader.infos[snpeff_field].desc 8782 pattern = r"'(.+?)'" 8783 match = re.search(pattern, ann_description) 8784 if match: 8785 ann_header_match = match.group(1).split(" | ") 8786 ann_header = [] 8787 ann_header_desc = {} 8788 for i in range(len(ann_header_match)): 8789 ann_header_info = "".join( 8790 char for char in ann_header_match[i] if char.isalnum() 8791 ) 8792 ann_header.append(ann_header_info) 8793 ann_header_desc[ann_header_info] = ann_header_match[i] 8794 if not ann_header_desc: 8795 raise ValueError("Invalid header description format") 8796 else: 8797 raise ValueError("Invalid header description format") 8798 8799 # Create variant id 8800 variant_id_column = self.get_variant_id_column() 8801 added_columns += [variant_id_column] 8802 8803 # Create dataframe 8804 dataframe_snpeff_hgvs = self.get_query_to_df( 8805 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8806 ) 8807 
8808 # Create snpEff columns 8809 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8810 speff_ann_infos 8811 ].apply( 8812 lambda x: explode_snpeff_ann( 8813 str(x), 8814 uniquify=uniquify, 8815 output_format=output_format, 8816 prefix=output_prefix, 8817 header=list(ann_header_desc.values()), 8818 ) 8819 ) 8820 8821 # Header 8822 ann_annotations_prefix = "" 8823 if output_format.upper() in ["JSON"]: 8824 ann_annotations_prefix = f"{output_prefix}=" 8825 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8826 output_prefix, 8827 ".", 8828 "String", 8829 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8830 + " - JSON format", 8831 "howard calculation", 8832 "0", 8833 self.code_type_map.get("String"), 8834 ) 8835 else: 8836 for ann_annotation in ann_header: 8837 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8838 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8839 ann_annotation_id, 8840 ".", 8841 "String", 8842 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8843 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8844 "howard calculation", 8845 "0", 8846 self.code_type_map.get("String"), 8847 ) 8848 8849 # Update 8850 sql_update = f""" 8851 UPDATE variants 8852 SET "INFO" = 8853 concat( 8854 CASE 8855 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8856 THEN '' 8857 ELSE concat("INFO", ';') 8858 END, 8859 CASE 8860 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8861 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8862 THEN concat( 8863 '{ann_annotations_prefix}', 8864 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8865 ) 8866 ELSE '' 8867 END 8868 ) 8869 FROM dataframe_snpeff_hgvs 8870 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8871 8872 """ 8873 self.conn.execute(sql_update) 8874 8875 # Delete dataframe 8876 del dataframe_snpeff_hgvs 8877 gc.collect() 8878 8879 else: 8880 8881 log.warning( 8882 "No snpEff annotation. 
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Reads its options from param["calculation"]["calculations"]["NOMEN"]["options"]
        (hgvs_field, pattern, transcripts, transcripts_table, transcripts_column,
        transcripts_order), computes the NOMEN fields per variant and appends them
        to the INFO column of the variants table.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # NOMEN field (temporary dataframe column holding the per-variant dict)
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Get HGVS field (INFO field holding the hgvs annotation; default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts (file whose first column lists preferred transcripts)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                # First column of the file is the transcript identifier
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # Per-variant transcript comes from this table column
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No per-variant transcript available: SELECT NULL AS transcript below
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe keyed by (#CHROM, POS, REF, ALT)
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column: one dict of NOMEN fields per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (lambda captures nomen_field but is applied immediately within
                # this iteration, so late binding is not an issue here)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # CASE fragment appending ';FIELD=value' when the value is set
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO, joined on the variant key.
            # NOTE(review): unlike sibling methods, only NULL INFO is blanked here —
            # an INFO of '' or '.' is kept and fields are appended after it; confirm
            # this difference is intended
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It is used to create the annotation field in the 9095 VCF header and to update the corresponding field in the variants table, defaults to 9096 findbypipeline 9097 :type tag: str (optional) 9098 """ 9099 9100 # if FORMAT and samples 9101 if ( 9102 "FORMAT" in self.get_header_columns_as_list() 9103 and self.get_header_sample_list() 9104 ): 9105 9106 # findbypipeline annotation field 9107 findbypipeline_tag = tag 9108 9109 # VCF infos tags 9110 vcf_infos_tags = { 9111 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9112 } 9113 9114 # Prefix 9115 prefix = self.get_explode_infos_prefix() 9116 9117 # Field 9118 findbypipeline_infos = prefix + findbypipeline_tag 9119 9120 # Variants table 9121 table_variants = self.get_table_variants() 9122 9123 # Header 9124 vcf_reader = self.get_header() 9125 9126 # Create variant id 9127 variant_id_column = self.get_variant_id_column() 9128 added_columns = [variant_id_column] 9129 9130 # variant_id, FORMAT and samples 9131 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9132 self.get_header_sample_list() 9133 ) 9134 9135 # Create dataframe 9136 dataframe_findbypipeline = self.get_query_to_df( 9137 f""" SELECT {samples_fields} FROM {table_variants} """ 9138 ) 9139 9140 # Create findbypipeline column 9141 dataframe_findbypipeline[findbypipeline_infos] = ( 9142 dataframe_findbypipeline.apply( 9143 lambda row: findbypipeline( 9144 row, samples=self.get_header_sample_list() 9145 ), 9146 axis=1, 9147 ) 9148 ) 9149 9150 # Add snpeff_hgvs to header 9151 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9152 findbypipeline_tag, 9153 ".", 9154 "String", 9155 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9156 "howard calculation", 9157 "0", 9158 self.code_type_map.get("String"), 9159 ) 9160 9161 # Update 9162 sql_update = f""" 9163 UPDATE variants 9164 SET "INFO" = 9165 concat( 9166 CASE 9167 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9168 THEN '' 9169 ELSE 
concat("INFO", ';') 9170 END, 9171 CASE 9172 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9173 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9174 THEN concat( 9175 '{findbypipeline_tag}=', 9176 dataframe_findbypipeline."{findbypipeline_infos}" 9177 ) 9178 ELSE '' 9179 END 9180 ) 9181 FROM dataframe_findbypipeline 9182 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9183 """ 9184 self.conn.execute(sql_update) 9185 9186 # Remove added columns 9187 for added_column in added_columns: 9188 self.drop_column(column=added_column) 9189 9190 # Delete dataframe 9191 del dataframe_findbypipeline 9192 gc.collect() 9193 9194 def calculation_genotype_concordance(self) -> None: 9195 """ 9196 The function `calculation_genotype_concordance` calculates the genotype concordance for 9197 multi-caller VCF files and updates the variant information in the database. 9198 """ 9199 9200 # if FORMAT and samples 9201 if ( 9202 "FORMAT" in self.get_header_columns_as_list() 9203 and self.get_header_sample_list() 9204 ): 9205 9206 # genotypeconcordance annotation field 9207 genotypeconcordance_tag = "genotypeconcordance" 9208 9209 # VCF infos tags 9210 vcf_infos_tags = { 9211 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9212 } 9213 9214 # Prefix 9215 prefix = self.get_explode_infos_prefix() 9216 9217 # Field 9218 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9219 9220 # Variants table 9221 table_variants = self.get_table_variants() 9222 9223 # Header 9224 vcf_reader = self.get_header() 9225 9226 # Create variant id 9227 variant_id_column = self.get_variant_id_column() 9228 added_columns = [variant_id_column] 9229 9230 # variant_id, FORMAT and samples 9231 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9232 self.get_header_sample_list() 9233 ) 9234 9235 # Create dataframe 9236 dataframe_genotypeconcordance = self.get_query_to_df( 9237 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9238 ) 9239 9240 # Create genotypeconcordance column 9241 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9242 dataframe_genotypeconcordance.apply( 9243 lambda row: genotypeconcordance( 9244 row, samples=self.get_header_sample_list() 9245 ), 9246 axis=1, 9247 ) 9248 ) 9249 9250 # Add genotypeconcordance to header 9251 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9252 genotypeconcordance_tag, 9253 ".", 9254 "String", 9255 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9256 "howard calculation", 9257 "0", 9258 self.code_type_map.get("String"), 9259 ) 9260 9261 # Update 9262 sql_update = f""" 9263 UPDATE variants 9264 SET "INFO" = 9265 concat( 9266 CASE 9267 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9268 THEN '' 9269 ELSE concat("INFO", ';') 9270 END, 9271 CASE 9272 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9273 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9274 THEN concat( 9275 '{genotypeconcordance_tag}=', 9276 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9277 ) 9278 ELSE '' 9279 END 9280 ) 9281 FROM dataframe_genotypeconcordance 9282 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9283 """ 9284 self.conn.execute(sql_update) 9285 9286 # Remove added columns 9287 for added_column in added_columns: 9288 self.drop_column(column=added_column) 9289 9290 # Delete dataframe 9291 del dataframe_genotypeconcordance 9292 gc.collect() 9293 9294 def calculation_barcode(self, tag: str = "barcode") -> None: 9295 """ 9296 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9297 updates the INFO field in the file with the calculated barcode values. 9298 9299 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9300 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9301 the default tag name is set to "barcode", defaults to barcode 9302 :type tag: str (optional) 9303 """ 9304 9305 # if FORMAT and samples 9306 if ( 9307 "FORMAT" in self.get_header_columns_as_list() 9308 and self.get_header_sample_list() 9309 ): 9310 9311 # barcode annotation field 9312 if not tag: 9313 tag = "barcode" 9314 9315 # VCF infos tags 9316 vcf_infos_tags = { 9317 tag: "barcode calculation (VaRank)", 9318 } 9319 9320 # Prefix 9321 prefix = self.get_explode_infos_prefix() 9322 9323 # Field 9324 barcode_infos = prefix + tag 9325 9326 # Variants table 9327 table_variants = self.get_table_variants() 9328 9329 # Header 9330 vcf_reader = self.get_header() 9331 9332 # Create variant id 9333 variant_id_column = self.get_variant_id_column() 9334 added_columns = [variant_id_column] 9335 9336 # variant_id, FORMAT and samples 9337 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9338 self.get_header_sample_list() 9339 ) 9340 9341 # Create dataframe 9342 dataframe_barcode = self.get_query_to_df( 9343 f""" SELECT {samples_fields} FROM {table_variants} """ 9344 ) 9345 9346 # Create barcode column 9347 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9348 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9349 ) 9350 9351 # Add barcode to header 9352 vcf_reader.infos[tag] = vcf.parser._Info( 9353 tag, 9354 ".", 9355 "String", 9356 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9357 "howard calculation", 9358 "0", 9359 self.code_type_map.get("String"), 9360 ) 9361 9362 # Update 9363 sql_update = f""" 9364 UPDATE {table_variants} 9365 SET "INFO" = 9366 concat( 9367 CASE 9368 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9369 THEN '' 9370 ELSE concat("INFO", ';') 9371 END, 9372 CASE 9373 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9374 AND dataframe_barcode."{barcode_infos}" NOT NULL 9375 THEN concat( 9376 '{tag}=', 9377 dataframe_barcode."{barcode_infos}" 9378 ) 9379 ELSE '' 9380 
END 9381 ) 9382 FROM dataframe_barcode 9383 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9384 """ 9385 self.conn.execute(sql_update) 9386 9387 # Remove added columns 9388 for added_column in added_columns: 9389 self.drop_column(column=added_column) 9390 9391 # Delete dataframe 9392 del dataframe_barcode 9393 gc.collect() 9394 9395 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9396 """ 9397 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9398 and updates the INFO field in the file with the calculated barcode values. 9399 9400 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9401 the barcode tag that will be added to the VCF file during the calculation process. If no value 9402 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9403 :type tag: str (optional) 9404 """ 9405 9406 # if FORMAT and samples 9407 if ( 9408 "FORMAT" in self.get_header_columns_as_list() 9409 and self.get_header_sample_list() 9410 ): 9411 9412 # barcode annotation field 9413 if not tag: 9414 tag = "BCF" 9415 9416 # VCF infos tags 9417 vcf_infos_tags = { 9418 tag: "barcode family calculation", 9419 f"{tag}S": "barcode family samples", 9420 } 9421 9422 # Param 9423 param = self.get_param() 9424 log.debug(f"param={param}") 9425 9426 # Prefix 9427 prefix = self.get_explode_infos_prefix() 9428 9429 # PED param 9430 ped = ( 9431 param.get("calculation", {}) 9432 .get("calculations", {}) 9433 .get("BARCODEFAMILY", {}) 9434 .get("family_pedigree", None) 9435 ) 9436 log.debug(f"ped={ped}") 9437 9438 # Load PED 9439 if ped: 9440 9441 # Pedigree is a file 9442 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9443 log.debug("Pedigree is file") 9444 with open(full_path(ped)) as ped: 9445 ped = yaml.safe_load(ped) 9446 9447 # Pedigree is a string 9448 elif isinstance(ped, str): 9449 log.debug("Pedigree is str") 
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat the string as a comma-separated sample list,
                        # mapping each sample name to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of pedigree sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column used to join the computed DataFrame back
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with one row per variant (id, FORMAT, family genotypes)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over the family samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family tags (FORMAT fields) to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column (every sample column plus FORMAT):
            # - family samples get the computed barcode and the family sample list
            # - the FORMAT column gets the two new tag names
            # - other samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # NOTE(review): for './.' genotypes this pads the value with one '.'
                # per FORMAT separator (strips alphanumerics from FORMAT, then turns
                # each ':' into ':.') — confirm against expected genotype layout
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
9589 """ 9590 9591 # if FORMAT and samples 9592 if ( 9593 "FORMAT" in self.get_header_columns_as_list() 9594 and self.get_header_sample_list() 9595 ): 9596 9597 # trio annotation field 9598 trio_tag = "trio" 9599 9600 # VCF infos tags 9601 vcf_infos_tags = { 9602 "trio": "trio calculation", 9603 } 9604 9605 # Param 9606 param = self.get_param() 9607 9608 # Prefix 9609 prefix = self.get_explode_infos_prefix() 9610 9611 # Trio param 9612 trio_ped = ( 9613 param.get("calculation", {}) 9614 .get("calculations", {}) 9615 .get("TRIO", {}) 9616 .get("trio_pedigree", None) 9617 ) 9618 9619 # Load trio 9620 if trio_ped: 9621 9622 # Trio pedigree is a file 9623 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9624 log.debug("TRIO pedigree is file") 9625 with open(full_path(trio_ped)) as trio_ped: 9626 trio_ped = yaml.safe_load(trio_ped) 9627 9628 # Trio pedigree is a string 9629 elif isinstance(trio_ped, str): 9630 log.debug("TRIO pedigree is str") 9631 try: 9632 trio_ped = json.loads(trio_ped) 9633 log.debug("TRIO pedigree is json str") 9634 except ValueError as e: 9635 trio_samples = trio_ped.split(",") 9636 if len(trio_samples) == 3: 9637 trio_ped = { 9638 "father": trio_samples[0], 9639 "mother": trio_samples[1], 9640 "child": trio_samples[2], 9641 } 9642 log.debug("TRIO pedigree is list str") 9643 else: 9644 msg_error = "TRIO pedigree not well formatted" 9645 log.error(msg_error) 9646 raise ValueError(msg_error) 9647 9648 # Trio pedigree is a dict 9649 elif isinstance(trio_ped, dict): 9650 log.debug("TRIO pedigree is dict") 9651 9652 # Trio pedigree is not well formatted 9653 else: 9654 msg_error = "TRIO pedigree not well formatted" 9655 log.error(msg_error) 9656 raise ValueError(msg_error) 9657 9658 # Construct trio list 9659 trio_samples = [ 9660 trio_ped.get("father", ""), 9661 trio_ped.get("mother", ""), 9662 trio_ped.get("child", ""), 9663 ] 9664 9665 else: 9666 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9667 samples_list = self.get_header_sample_list() 9668 if len(samples_list) >= 3: 9669 trio_samples = self.get_header_sample_list()[0:3] 9670 trio_ped = { 9671 "father": trio_samples[0], 9672 "mother": trio_samples[1], 9673 "child": trio_samples[2], 9674 } 9675 else: 9676 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9677 log.error(msg_error) 9678 raise ValueError(msg_error) 9679 9680 # Check trio pedigree 9681 if not trio_ped or len(trio_ped) != 3: 9682 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9683 log.error(msg_error) 9684 raise ValueError(msg_error) 9685 9686 # Log 9687 log.info( 9688 f"Calculation 'TRIO' - Samples: " 9689 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9690 ) 9691 9692 # Field 9693 trio_infos = prefix + trio_tag 9694 9695 # Variants table 9696 table_variants = self.get_table_variants() 9697 9698 # Header 9699 vcf_reader = self.get_header() 9700 9701 # Create variant id 9702 variant_id_column = self.get_variant_id_column() 9703 added_columns = [variant_id_column] 9704 9705 # variant_id, FORMAT and samples 9706 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9707 self.get_header_sample_list() 9708 ) 9709 9710 # Create dataframe 9711 dataframe_trio = self.get_query_to_df( 9712 f""" SELECT {samples_fields} FROM {table_variants} """ 9713 ) 9714 9715 # Create trio column 9716 dataframe_trio[trio_infos] = dataframe_trio.apply( 9717 lambda row: trio(row, samples=trio_samples), axis=1 9718 ) 9719 9720 # Add trio to header 9721 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9722 trio_tag, 9723 ".", 9724 "String", 9725 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9726 "howard calculation", 9727 "0", 9728 self.code_type_map.get("String"), 9729 ) 9730 9731 # Update 9732 sql_update = f""" 9733 UPDATE {table_variants} 9734 SET "INFO" = 9735 concat( 9736 CASE 9737 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9738 THEN '' 9739 ELSE 
concat("INFO", ';') 9740 END, 9741 CASE 9742 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9743 AND dataframe_trio."{trio_infos}" NOT NULL 9744 THEN concat( 9745 '{trio_tag}=', 9746 dataframe_trio."{trio_infos}" 9747 ) 9748 ELSE '' 9749 END 9750 ) 9751 FROM dataframe_trio 9752 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9753 """ 9754 self.conn.execute(sql_update) 9755 9756 # Remove added columns 9757 for added_column in added_columns: 9758 self.drop_column(column=added_column) 9759 9760 # Delete dataframe 9761 del dataframe_trio 9762 gc.collect() 9763 9764 def calculation_vaf_normalization(self) -> None: 9765 """ 9766 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9767 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9768 :return: The function does not return anything. 9769 """ 9770 9771 # if FORMAT and samples 9772 if ( 9773 "FORMAT" in self.get_header_columns_as_list() 9774 and self.get_header_sample_list() 9775 ): 9776 9777 # vaf_normalization annotation field 9778 vaf_normalization_tag = "VAF" 9779 9780 # VCF infos tags 9781 vcf_infos_tags = { 9782 "VAF": "VAF Variant Frequency", 9783 } 9784 9785 # Prefix 9786 prefix = self.get_explode_infos_prefix() 9787 9788 # Variants table 9789 table_variants = self.get_table_variants() 9790 9791 # Header 9792 vcf_reader = self.get_header() 9793 9794 # Do not calculate if VAF already exists 9795 if "VAF" in vcf_reader.formats: 9796 log.debug("VAF already on genotypes") 9797 return 9798 9799 # Create variant id 9800 variant_id_column = self.get_variant_id_column() 9801 added_columns = [variant_id_column] 9802 9803 # variant_id, FORMAT and samples 9804 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9805 f""" "{sample}" """ for sample in self.get_header_sample_list() 9806 ) 9807 9808 # Create dataframe 9809 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """
            log.debug(f"query={query}")
            dataframe_vaf_normalization = self.get_query_to_df(query=query)

            vaf_normalization_set = []

            # For each sample: compute the normalized genotype value in pandas and
            # record a SQL SET clause pointing at the DataFrame column
            for sample in self.get_header_sample_list():
                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                    lambda row: vaf_normalization(row, sample=sample), axis=1
                )
                vaf_normalization_set.append(
                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
                )

            # Add VAF to FORMAT
            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
                "FORMAT"
            ].apply(lambda x: str(x) + ":VAF")
            vaf_normalization_set.append(
                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
            )

            # Add vaf_normalization to header
            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
                id=vaf_normalization_tag,
                num="1",
                type="Float",
                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
                type_code=self.code_type_map.get("Float"),
            )

            # Create fields to add in INFO
            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

            # Update all sample columns and FORMAT in a single joined UPDATE.
            # NOTE(review): WHERE uses the hard-coded alias 'variants.' while the
            # UPDATE target is {table_variants} — confirm they are the same table.
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_vaf_normalization_set}
                FROM dataframe_vaf_normalization
                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_normalization
            gc.collect()

    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per computed statistic
            # NOTE(review): '_stats_mediane' spelling is part of the public tag
            # name — do not "fix" without a data-compatibility plan
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column used to join the computed DataFrame back
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with one row per variant (id, FORMAT, genotypes)
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (a dict of statistics per row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-row dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # No separator before the first appended field
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO with all statistics in one joined UPDATE
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.
10006 10007 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10008 is a string parameter that represents the information field to be used in the transcripts JSON. 10009 It is used to specify the JSON format for the transcripts information. If no value is provided 10010 when calling the method, it defaults to " 10011 :type info_json: str 10012 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10013 method is a string parameter that specifies the format of the information field to be used in 10014 the transcripts JSON. It is used to define the format of the information field 10015 :type info_format: str 10016 """ 10017 10018 # Create transcripts table 10019 transcripts_table = self.create_transcript_view() 10020 10021 # Add info field 10022 if transcripts_table: 10023 self.transcript_view_to_variants( 10024 transcripts_table=transcripts_table, 10025 transcripts_info_field_json=info_json, 10026 transcripts_info_field_format=info_format, 10027 ) 10028 else: 10029 log.info("No Transcripts to process. Check param.json file configuration") 10030 10031 def calculation_transcripts_prioritization(self) -> None: 10032 """ 10033 The function `calculation_transcripts_prioritization` creates a transcripts table and 10034 prioritizes transcripts based on certain criteria. 10035 """ 10036 10037 # Create transcripts table 10038 transcripts_table = self.create_transcript_view() 10039 10040 # Add info field 10041 if transcripts_table: 10042 self.transcripts_prioritization(transcripts_table=transcripts_table) 10043 else: 10044 log.info("No Transcripts to process. 
Check param.json file configuration") 10045 10046 def calculation_transcripts_export(self) -> None: 10047 """ """ 10048 10049 # Create transcripts table 10050 transcripts_table = self.create_transcript_view() 10051 10052 # Add info field 10053 if transcripts_table: 10054 self.transcripts_export(transcripts_table=transcripts_table) 10055 else: 10056 log.info("No Transcripts to process. Check param.json file configuration") 10057 10058 ############### 10059 # Transcripts # 10060 ############### 10061 10062 def transcripts_export( 10063 self, transcripts_table: str = None, param: dict = {} 10064 ) -> bool: 10065 """ """ 10066 10067 log.debug("Start transcripts export...") 10068 10069 # Param 10070 if not param: 10071 param = self.get_param() 10072 10073 # Param export 10074 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10075 10076 # Output file 10077 transcripts_export_output = param_transcript_export.get("output", None) 10078 10079 if not param_transcript_export or not transcripts_export_output: 10080 log.warning(f"No transcriipts export parameters defined!") 10081 return False 10082 10083 # List of transcripts annotations 10084 query_describe = f""" 10085 SELECT column_name 10086 FROM ( 10087 DESCRIBE SELECT * FROM {transcripts_table} 10088 ) 10089 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10090 """ 10091 transcripts_annotations_list = list( 10092 self.get_query_to_df(query=query_describe)["column_name"] 10093 ) 10094 10095 # Create transcripts table for export 10096 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10097 random.choices(string.ascii_uppercase + string.digits, k=10) 10098 ) 10099 query_create_transcripts_table_export = f""" 10100 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10101 """ 10102 self.execute_query(query=query_create_transcripts_table_export) 10103 10104 # 
        # Output file format
        transcripts_export_output_format = get_file_format(
            filename=transcripts_export_output
        )

        # Format VCF - construct INFO
        if transcripts_export_output_format in ["vcf"]:

            # Construct query update INFO and header
            query_update_info = []
            for field in transcripts_annotations_list:

                # If field not in header
                if field not in self.get_header_infos_list():

                    # Declare the annotation in the VCF header
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Add field as INFO/tag
                query_update_info.append(
                    f"""
                    CASE
                        WHEN "{field}" IS NOT NULL
                        THEN concat('{field}=', "{field}", ';')
                        ELSE ''
                    END
                    """
                )

            # Query param: serialize every annotation as 'field=value;' into INFO
            query_update_info_value = (
                f""" concat('', {", ".join(query_update_info)}) """
            )
            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """

        else:

            # Query param: non-VCF formats keep annotations as plain columns
            query_update_info_value = f""" NULL """
            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """

        # Update query INFO column
        query_update = f"""
            UPDATE {transcripts_table_export}
            SET INFO = {query_update_info_value}

        """
        self.execute_query(query=query_update)

        # Export
        self.export_output(
            output_file=transcripts_export_output,
            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
        )

        # Drop transcripts export table (temporary, created above)
        query_drop_transcripts_table_export = f"""
            DROP TABLE {transcripts_table_export}
        """
        self.execute_query(query=query_drop_transcripts_table_export)

    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts.
It 10187 is used to customize the behavior of the prioritization algorithm and includes settings such as 10188 the prefix for prioritization fields, default profiles, and other 10189 :type param: dict 10190 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10191 transcripts prioritization process is successfully completed, and `False` if there are any 10192 issues or if no profile is defined for transcripts prioritization. 10193 """ 10194 10195 log.debug("Start transcripts prioritization...") 10196 10197 # Param 10198 if not param: 10199 param = self.get_param() 10200 10201 # Variants table 10202 table_variants = self.get_table_variants() 10203 10204 # Transcripts table 10205 if transcripts_table is None: 10206 transcripts_table = self.create_transcript_view( 10207 transcripts_table="transcripts", param=param 10208 ) 10209 if transcripts_table is None: 10210 msg_err = "No Transcripts table availalble" 10211 log.error(msg_err) 10212 raise ValueError(msg_err) 10213 log.debug(f"transcripts_table={transcripts_table}") 10214 10215 # Get transcripts columns 10216 columns_as_list_query = f""" 10217 DESCRIBE {transcripts_table} 10218 """ 10219 columns_as_list = list( 10220 self.get_query_to_df(columns_as_list_query)["column_name"] 10221 ) 10222 10223 # Create INFO if not exists 10224 if "INFO" not in columns_as_list: 10225 query_add_info = f""" 10226 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10227 """ 10228 self.execute_query(query_add_info) 10229 10230 # Prioritization param and Force only PZ Score and Flag 10231 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10232 10233 # PZ profile by default 10234 pz_profile_default = ( 10235 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10236 ) 10237 10238 # Exit if no profile 10239 if pz_profile_default is None: 10240 log.warning("No profile defined for transcripts prioritization") 10241 return False 10242 10243 # PZ 
fields 10244 pz_param_pzfields = {} 10245 10246 # PZ field transcripts 10247 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10248 10249 # Add PZ Transcript in header 10250 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10251 pz_fields_transcripts, 10252 ".", 10253 "String", 10254 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10255 "unknown", 10256 "unknown", 10257 code_type_map["String"], 10258 ) 10259 10260 # Mandatory fields 10261 pz_mandatory_fields_list = [ 10262 "Score", 10263 "Flag", 10264 "Tags", 10265 "Comment", 10266 "Infos", 10267 "Class", 10268 ] 10269 pz_mandatory_fields = [] 10270 for pz_mandatory_field in pz_mandatory_fields_list: 10271 pz_mandatory_fields.append( 10272 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10273 ) 10274 10275 # PZ fields in param 10276 for pz_field in pz_param.get("pzfields", []): 10277 if pz_field in pz_mandatory_fields_list: 10278 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10279 pz_param.get("pzprefix", "PTZ") + pz_field 10280 ) 10281 else: 10282 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10283 pz_param_pzfields[pz_field] = pz_field_new 10284 10285 # Add PZ Transcript in header 10286 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10287 pz_field_new, 10288 ".", 10289 "String", 10290 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10291 "unknown", 10292 "unknown", 10293 code_type_map["String"], 10294 ) 10295 10296 # PZ fields param 10297 pz_param["pzfields"] = pz_mandatory_fields 10298 10299 # Prioritization 10300 prioritization_result = self.prioritization( 10301 table=transcripts_table, 10302 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10303 ) 10304 if not prioritization_result: 10305 log.warning("Transcripts prioritization not processed") 10306 return False 10307 10308 # PZ fields sql query 10309 
query_update_select_list = [] 10310 query_update_concat_list = [] 10311 query_update_order_list = [] 10312 for pz_param_pzfield in set( 10313 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10314 ): 10315 query_update_select_list.append(f" {pz_param_pzfield}, ") 10316 10317 for pz_param_pzfield in pz_param_pzfields: 10318 query_update_concat_list.append( 10319 f""" 10320 , CASE 10321 WHEN {pz_param_pzfield} IS NOT NULL 10322 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10323 ELSE '' 10324 END 10325 """ 10326 ) 10327 10328 # Order by 10329 pz_orders = ( 10330 param.get("transcripts", {}) 10331 .get("prioritization", {}) 10332 .get("prioritization_transcripts_order", {}) 10333 ) 10334 if not pz_orders: 10335 pz_orders = { 10336 pz_param.get("pzprefix", "PTZ") + "Flag": "DESC", 10337 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10338 } 10339 for pz_order in pz_orders: 10340 query_update_order_list.append( 10341 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10342 ) 10343 10344 # Fields to explode 10345 fields_to_explode = ( 10346 list(pz_param_pzfields.keys()) 10347 + pz_mandatory_fields 10348 + list(pz_orders.keys()) 10349 ) 10350 # Remove transcript column as a specific transcript column 10351 if "transcript" in fields_to_explode: 10352 fields_to_explode.remove("transcript") 10353 10354 # Fields intranscripts table 10355 query_transcripts_table = f""" 10356 DESCRIBE SELECT * FROM {transcripts_table} 10357 """ 10358 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10359 10360 # Check fields to explode 10361 for field_to_explode in fields_to_explode: 10362 if field_to_explode not in self.get_header_infos_list() + list( 10363 query_transcripts_table.column_name 10364 ): 10365 msg_err = f"INFO/{field_to_explode} NOT IN header" 10366 log.error(msg_err) 10367 raise ValueError(msg_err) 10368 10369 # Explode fields to explode 10370 self.explode_infos( 10371 table=transcripts_table, 10372 
fields=fields_to_explode, 10373 ) 10374 10375 # Transcript preference file 10376 transcripts_preference_file = ( 10377 param.get("transcripts", {}) 10378 .get("prioritization", {}) 10379 .get("prioritization_transcripts", {}) 10380 ) 10381 transcripts_preference_file = full_path(transcripts_preference_file) 10382 10383 # Transcript preference forced 10384 transcript_preference_force = ( 10385 param.get("transcripts", {}) 10386 .get("prioritization", {}) 10387 .get("prioritization_transcripts_force", False) 10388 ) 10389 # Transcript version forced 10390 transcript_version_force = ( 10391 param.get("transcripts", {}) 10392 .get("prioritization", {}) 10393 .get("prioritization_transcripts_version_force", False) 10394 ) 10395 10396 # Transcripts Ranking 10397 if transcripts_preference_file: 10398 10399 # Transcripts file to dataframe 10400 if os.path.exists(transcripts_preference_file): 10401 transcripts_preference_dataframe = transcripts_file_to_df( 10402 transcripts_preference_file 10403 ) 10404 else: 10405 log.error( 10406 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10407 ) 10408 raise ValueError( 10409 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10410 ) 10411 10412 # Order by depending to transcript preference forcing 10413 if transcript_preference_force: 10414 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10415 else: 10416 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10417 10418 # Transcript columns joined depend on version consideration 10419 if transcript_version_force: 10420 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10421 else: 10422 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10423 10424 # Query 
ranking for update 10425 query_update_ranking = f""" 10426 SELECT 10427 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10428 ROW_NUMBER() OVER ( 10429 PARTITION BY "#CHROM", POS, REF, ALT 10430 ORDER BY {order_by} 10431 ) AS rn 10432 FROM {transcripts_table} 10433 LEFT JOIN 10434 ( 10435 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10436 FROM transcripts_preference_dataframe 10437 ) AS transcripts_preference 10438 ON {transcripts_version_join} 10439 """ 10440 10441 else: 10442 10443 # Query ranking for update 10444 query_update_ranking = f""" 10445 SELECT 10446 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10447 ROW_NUMBER() OVER ( 10448 PARTITION BY "#CHROM", POS, REF, ALT 10449 ORDER BY {" , ".join(query_update_order_list)} 10450 ) AS rn 10451 FROM {transcripts_table} 10452 """ 10453 10454 # Export Transcripts prioritization infos to variants table 10455 query_update = f""" 10456 WITH RankedTranscripts AS ( 10457 {query_update_ranking} 10458 ) 10459 UPDATE {table_variants} 10460 SET 10461 INFO = CONCAT(CASE 10462 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10463 THEN '' 10464 ELSE concat("INFO", ';') 10465 END, 10466 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10467 ) 10468 FROM 10469 RankedTranscripts 10470 WHERE 10471 rn = 1 10472 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10473 AND variants."POS" = RankedTranscripts."POS" 10474 AND variants."REF" = RankedTranscripts."REF" 10475 AND variants."ALT" = RankedTranscripts."ALT" 10476 """ 10477 10478 # log.debug(f"query_update={query_update}") 10479 self.execute_query(query=query_update) 10480 10481 # Return 10482 return True 10483 10484 def create_transcript_view_from_columns_map( 10485 self, 10486 transcripts_table: str = "transcripts", 10487 columns_maps: dict = {}, 10488 added_columns: list = [], 10489 temporary_tables: list = None, 10490 
annotation_fields: list = None, 10491 column_rename: dict = {}, 10492 column_clean: bool = False, 10493 column_case: str = None, 10494 ) -> tuple[list, list, list]: 10495 """ 10496 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10497 specified columns mapping for transcripts data. 10498 10499 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10500 of the table where the transcripts data is stored or will be stored in the database. This table 10501 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10502 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10503 :type transcripts_table: str (optional) 10504 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10505 about how to map columns from a transcripts table to create a view. Each entry in the 10506 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10507 typically includes details such as the main transcript column and additional information columns 10508 :type columns_maps: dict 10509 :param added_columns: The `added_columns` parameter in the 10510 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10511 that will be added to the view being created based on the columns map provided. These columns 10512 are generated by exploding the transcript information columns along with the main transcript 10513 column 10514 :type added_columns: list 10515 :param temporary_tables: The `temporary_tables` parameter in the 10516 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10517 tables created during the process of creating a transcript view from a columns map. 
These 10518 temporary tables are used to store intermediate results or transformations before the final view 10519 is generated 10520 :type temporary_tables: list 10521 :param annotation_fields: The `annotation_fields` parameter in the 10522 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10523 used for annotation in the query view creation process. These fields are extracted from the 10524 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10525 :type annotation_fields: list 10526 :param column_rename: The `column_rename` parameter in the 10527 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10528 custom renaming for columns during the creation of the temporary table view. This parameter 10529 provides a mapping of original column names to the desired renamed column names. By using this 10530 parameter, 10531 :type column_rename: dict 10532 :param column_clean: The `column_clean` parameter in the 10533 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10534 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10535 removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to 10536 False 10537 :type column_clean: bool (optional) 10538 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10539 function is used to specify the case transformation to be applied to the columns during the view 10540 creation process. It allows you to control whether the column values should be converted to 10541 lowercase, uppercase, or remain unchanged 10542 :type column_case: str 10543 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10544 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
10545 """ 10546 10547 log.debug("Start transcrpts view creation from columns map...") 10548 10549 # "from_columns_map": [ 10550 # { 10551 # "transcripts_column": "Ensembl_transcriptid", 10552 # "transcripts_infos_columns": [ 10553 # "genename", 10554 # "Ensembl_geneid", 10555 # "LIST_S2_score", 10556 # "LIST_S2_pred", 10557 # ], 10558 # }, 10559 # { 10560 # "transcripts_column": "Ensembl_transcriptid", 10561 # "transcripts_infos_columns": [ 10562 # "genename", 10563 # "VARITY_R_score", 10564 # "Aloft_pred", 10565 # ], 10566 # }, 10567 # ], 10568 10569 # Init 10570 if temporary_tables is None: 10571 temporary_tables = [] 10572 if annotation_fields is None: 10573 annotation_fields = [] 10574 10575 # Variants table 10576 table_variants = self.get_table_variants() 10577 10578 for columns_map in columns_maps: 10579 10580 # Transcript column 10581 transcripts_column = columns_map.get("transcripts_column", None) 10582 10583 # Transcripts infos columns 10584 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10585 10586 # Transcripts infos columns rename 10587 column_rename = columns_map.get("column_rename", column_rename) 10588 10589 # Transcripts infos columns clean 10590 column_clean = columns_map.get("column_clean", column_clean) 10591 10592 # Transcripts infos columns case 10593 column_case = columns_map.get("column_case", column_case) 10594 10595 if transcripts_column is not None: 10596 10597 # Explode 10598 added_columns += self.explode_infos( 10599 fields=[transcripts_column] + transcripts_infos_columns 10600 ) 10601 10602 # View clauses 10603 clause_select_variants = [] 10604 clause_select_tanscripts = [] 10605 for field in [transcripts_column] + transcripts_infos_columns: 10606 10607 # AS field 10608 as_field = field 10609 10610 # Rename 10611 if column_rename: 10612 as_field = column_rename.get(as_field, as_field) 10613 10614 # Clean 10615 if column_clean: 10616 as_field = clean_annotation_field(as_field) 10617 10618 # Case 10619 if 
column_case: 10620 if column_case.lower() in ["lower"]: 10621 as_field = as_field.lower() 10622 elif column_case.lower() in ["upper"]: 10623 as_field = as_field.upper() 10624 10625 # Clause select Variants 10626 clause_select_variants.append( 10627 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10628 ) 10629 10630 if field in [transcripts_column]: 10631 clause_select_tanscripts.append( 10632 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10633 ) 10634 else: 10635 clause_select_tanscripts.append( 10636 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10637 ) 10638 annotation_fields.append(as_field) 10639 10640 # Querey View 10641 query = f""" 10642 SELECT 10643 "#CHROM", POS, REF, ALT, INFO, 10644 "{transcripts_column}" AS 'transcript', 10645 {", ".join(clause_select_tanscripts)} 10646 FROM ( 10647 SELECT 10648 "#CHROM", POS, REF, ALT, INFO, 10649 {", ".join(clause_select_variants)} 10650 FROM {table_variants} 10651 ) 10652 WHERE "{transcripts_column}" IS NOT NULL 10653 """ 10654 10655 # Create temporary table 10656 temporary_table = transcripts_table + "".join( 10657 random.choices(string.ascii_uppercase + string.digits, k=10) 10658 ) 10659 10660 # Temporary_tables 10661 temporary_tables.append(temporary_table) 10662 query_view = f""" 10663 CREATE TEMPORARY TABLE {temporary_table} 10664 AS ({query}) 10665 """ 10666 self.execute_query(query=query_view) 10667 10668 return added_columns, temporary_tables, annotation_fields 10669 10670 def create_transcript_view_from_column_format( 10671 self, 10672 transcripts_table: str = "transcripts", 10673 column_formats: dict = {}, 10674 temporary_tables: list = None, 10675 annotation_fields: list = None, 10676 column_rename: dict = {}, 10677 column_clean: bool = False, 10678 column_case: str = None, 10679 ) -> tuple[list, list, list]: 10680 """ 10681 The `create_transcript_view_from_column_format` function generates a transcript view based on 10682 specified column formats, adds additional 
columns and annotation fields, and returns the list of 10683 temporary tables and annotation fields. 10684 10685 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10686 of the table containing the transcripts data. This table will be used as the base table for 10687 creating the transcript view. The default value for this parameter is "transcripts", but you can 10688 provide a different table name if needed, defaults to transcripts 10689 :type transcripts_table: str (optional) 10690 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10691 about the columns to be used for creating the transcript view. Each entry in the dictionary 10692 specifies the mapping between a transcripts column and a transcripts infos column. This 10693 parameter allows you to define how the columns from the transcripts table should be transformed 10694 or mapped 10695 :type column_formats: dict 10696 :param temporary_tables: The `temporary_tables` parameter in the 10697 `create_transcript_view_from_column_format` function is a list that stores the names of 10698 temporary views created during the process of creating a transcript view from a column format. 10699 These temporary views are used to manipulate and extract data before generating the final 10700 transcript view 10701 :type temporary_tables: list 10702 :param annotation_fields: The `annotation_fields` parameter in the 10703 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10704 that are extracted from the temporary views created during the process. 
These annotation fields 10705 are obtained by querying the temporary views and extracting the column names excluding specific 10706 columns like `#CH 10707 :type annotation_fields: list 10708 :param column_rename: The `column_rename` parameter in the 10709 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10710 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10711 column names to new column names in this dictionary, you can rename specific columns during the 10712 process 10713 :type column_rename: dict 10714 :param column_clean: The `column_clean` parameter in the 10715 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10716 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10717 will be cleaned during the creation of the transcript view based on the specified column format, 10718 defaults to False 10719 :type column_clean: bool (optional) 10720 :param column_case: The `column_case` parameter in the 10721 `create_transcript_view_from_column_format` function is used to specify the case transformation 10722 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10723 to convert the column names to uppercase or lowercase, respectively 10724 :type column_case: str 10725 :return: The `create_transcript_view_from_column_format` function returns two lists: 10726 `temporary_tables` and `annotation_fields`. 
10727 """ 10728 10729 log.debug("Start transcrpts view creation from column format...") 10730 10731 # "from_column_format": [ 10732 # { 10733 # "transcripts_column": "ANN", 10734 # "transcripts_infos_column": "Feature_ID", 10735 # } 10736 # ], 10737 10738 # Init 10739 if temporary_tables is None: 10740 temporary_tables = [] 10741 if annotation_fields is None: 10742 annotation_fields = [] 10743 10744 for column_format in column_formats: 10745 10746 # annotation field and transcript annotation field 10747 annotation_field = column_format.get("transcripts_column", "ANN") 10748 transcript_annotation = column_format.get( 10749 "transcripts_infos_column", "Feature_ID" 10750 ) 10751 10752 # Transcripts infos columns rename 10753 column_rename = column_format.get("column_rename", column_rename) 10754 10755 # Transcripts infos columns clean 10756 column_clean = column_format.get("column_clean", column_clean) 10757 10758 # Transcripts infos columns case 10759 column_case = column_format.get("column_case", column_case) 10760 10761 # Temporary View name 10762 temporary_view_name = transcripts_table + "".join( 10763 random.choices(string.ascii_uppercase + string.digits, k=10) 10764 ) 10765 10766 # Create temporary view name 10767 temporary_view_name = self.annotation_format_to_table( 10768 uniquify=True, 10769 annotation_field=annotation_field, 10770 view_name=temporary_view_name, 10771 annotation_id=transcript_annotation, 10772 column_rename=column_rename, 10773 column_clean=column_clean, 10774 column_case=column_case, 10775 ) 10776 10777 # Annotation fields 10778 if temporary_view_name: 10779 query_annotation_fields = f""" 10780 SELECT * 10781 FROM ( 10782 DESCRIBE SELECT * 10783 FROM {temporary_view_name} 10784 ) 10785 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10786 """ 10787 df_annotation_fields = self.get_query_to_df( 10788 query=query_annotation_fields 10789 ) 10790 10791 # Add temporary view and annotation fields 10792 
temporary_tables.append(temporary_view_name) 10793 annotation_fields += list(set(df_annotation_fields["column_name"])) 10794 10795 return temporary_tables, annotation_fields 10796 10797 def create_transcript_view( 10798 self, 10799 transcripts_table: str = None, 10800 transcripts_table_drop: bool = False, 10801 param: dict = {}, 10802 ) -> str: 10803 """ 10804 The `create_transcript_view` function generates a transcript view by processing data from a 10805 specified table based on provided parameters and structural information. 10806 10807 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10808 is used to specify the name of the table that will store the final transcript view data. If a table 10809 name is not provided, the function will create a new table to store the transcript view data, and by 10810 default,, defaults to transcripts 10811 :type transcripts_table: str (optional) 10812 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10813 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10814 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10815 the function will drop the existing transcripts table if it exists, defaults to False 10816 :type transcripts_table_drop: bool (optional) 10817 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10818 contains information needed to create a transcript view. It includes details such as the structure 10819 of the transcripts, columns mapping, column formats, and other necessary information for generating 10820 the view. This parameter allows for flexibility and customization 10821 :type param: dict 10822 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10823 created or modified during the execution of the function. 
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        Create a per-transcript view (table) from the variants table.

        The view is assembled from the "struct" section of the transcripts
        parameters: per-column maps ("from_columns_map") and structured
        annotation columns ("from_column_format") each produce temporary
        tables, which are merged (UNION BY NAME) and aggregated per transcript.
        Optionally, transcript identifiers are mapped through an alias file
        and/or stripped of their version suffix.

        :param transcripts_table: name of the table that will store the final
            transcript view; when None, taken from param (default
            "transcripts")
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: drop an existing transcripts table
            before creating the new one, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: configuration dictionary (structure of the transcripts,
            columns mapping, column formats...); if empty, `self.get_param()`
            is used
        :type param: dict
        :return: the name of the created transcripts table, or None when no
            "struct" section is configured
        """

        log.debug("Start transcripts view creation...")

        # Default table name
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct section driving the view creation
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip the ".N" suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping: alias file (transcript/alias pairs)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping: keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (removed at the end)
            added_columns = []

            # Temporary tables produced by the struct processors
            temporary_tables = []

            # Annotation fields collected from the struct processors
            annotation_fields = []

            # From columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and remove key/reserved columns from annotations
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query (UNION BY NAME of all of them)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary tables (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Per-transcript aggregation clauses
            query_merge_on_transcripts_annotation_fields = []

            # Aggregate the list of source transcript identifiers
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotation fields (distinct values joined by ',')
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Mapping file to dataframe.
                # NOTE(review): the dataframe variable appears unused, but it is
                # referenced by name in the SQL below (DuckDB resolves Python
                # dataframes by variable name) — do not remove it
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key: mapped transcript when available, else original
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query (first aggregation pass, with the mapping join)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for the second aggregation pass
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping (second aggregation pass on mapped IDs)
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Optionally keep only transcripts listed in the mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections.
                # NOTE(review): these two variables are not used further down in
                # this branch (the query below uses query_transcript_column
                # directly) — presumably kept for symmetry with the mapping
                # branch; confirm before removing
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (single aggregation pass)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcripts table if requested
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove the columns that were added to the variants table
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No "struct" section: nothing to create
            transcripts_table = None

        return transcripts_table
If set to `True`, the function will make sure that the 11098 output values are unique, defaults to True 11099 :type uniquify: bool (optional) 11100 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11101 that contains the annotation information for each variant. This field is used to extract the 11102 annotation details for further processing in the function. By default, it is set to "ANN", 11103 defaults to ANN 11104 :type annotation_field: str (optional) 11105 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11106 is used to specify the identifier for the annotation feature. This identifier will be used as a 11107 column name in the resulting table or view that is created based on the annotation data. It 11108 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11109 :type annotation_id: str (optional) 11110 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11111 to specify the name of the temporary table that will be created to store the transformed 11112 annotation data. This table will hold the extracted information from the annotation field in a 11113 structured format for further processing or analysis. By default,, defaults to transcripts 11114 :type view_name: str (optional) 11115 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11116 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11117 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11118 created based on the annotation data. This feature enables 11119 :type column_rename: dict 11120 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11121 a boolean flag that determines whether the annotation field should undergo a cleaning process. 
11122 If set to `True`, the function will clean the annotation field before further processing. This 11123 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11124 to False 11125 :type column_clean: bool (optional) 11126 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11127 used to specify the case transformation to be applied to the column names extracted from the 11128 annotation data. It allows you to set the case of the column names to either lowercase or 11129 uppercase for consistency or other specific requirements during the conversion 11130 :type column_case: str 11131 :return: The function `annotation_format_to_table` is returning the name of the view created, 11132 which is stored in the variable `view_name`. 11133 """ 11134 11135 # Annotation field 11136 annotation_format = "annotation_explode" 11137 11138 # Transcript annotation 11139 if column_rename: 11140 annotation_id = column_rename.get(annotation_id, annotation_id) 11141 11142 if column_clean: 11143 annotation_id = clean_annotation_field(annotation_id) 11144 11145 # Prefix 11146 prefix = self.get_explode_infos_prefix() 11147 if prefix: 11148 prefix = "INFO/" 11149 11150 # Annotation fields 11151 annotation_infos = prefix + annotation_field 11152 annotation_format_infos = prefix + annotation_format 11153 11154 # Variants table 11155 table_variants = self.get_table_variants() 11156 11157 # Header 11158 vcf_reader = self.get_header() 11159 11160 # Add columns 11161 added_columns = [] 11162 11163 # Explode HGVS field in column 11164 added_columns += self.explode_infos(fields=[annotation_field]) 11165 11166 if annotation_field in vcf_reader.infos: 11167 11168 # Extract ANN header 11169 ann_description = vcf_reader.infos[annotation_field].desc 11170 pattern = r"'(.+?)'" 11171 match = re.search(pattern, ann_description) 11172 if match: 11173 ann_header_match = match.group(1).split(" | ") 11174 ann_header = [] 
11175 ann_header_desc = {} 11176 for i in range(len(ann_header_match)): 11177 ann_header_info = "".join( 11178 char for char in ann_header_match[i] if char.isalnum() 11179 ) 11180 ann_header.append(ann_header_info) 11181 ann_header_desc[ann_header_info] = ann_header_match[i] 11182 if not ann_header_desc: 11183 raise ValueError("Invalid header description format") 11184 else: 11185 raise ValueError("Invalid header description format") 11186 11187 # Create variant id 11188 variant_id_column = self.get_variant_id_column() 11189 added_columns += [variant_id_column] 11190 11191 # Create dataframe 11192 dataframe_annotation_format = self.get_query_to_df( 11193 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 11194 ) 11195 11196 # Create annotation columns 11197 dataframe_annotation_format[ 11198 annotation_format_infos 11199 ] = dataframe_annotation_format[annotation_infos].apply( 11200 lambda x: explode_annotation_format( 11201 annotation=str(x), 11202 uniquify=uniquify, 11203 output_format="JSON", 11204 prefix="", 11205 header=list(ann_header_desc.values()), 11206 ) 11207 ) 11208 11209 # Find keys 11210 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 11211 df_keys = self.get_query_to_df(query=query_json) 11212 11213 # Check keys 11214 query_json_key = [] 11215 for _, row in df_keys.iterrows(): 11216 11217 # Key 11218 key = row.iloc[0] 11219 key_clean = key 11220 11221 # key rename 11222 if column_rename: 11223 key_clean = column_rename.get(key_clean, key_clean) 11224 11225 # key clean 11226 if column_clean: 11227 key_clean = clean_annotation_field(key_clean) 11228 11229 # Key case 11230 if column_case: 11231 if column_case.lower() in ["lower"]: 11232 key_clean = key_clean.lower() 11233 elif column_case.lower() in ["upper"]: 11234 key_clean = key_clean.upper() 11235 11236 # Type 11237 query_json_type = f"""SELECT 
unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 11238 11239 # Get DataFrame from query 11240 df_json_type = self.get_query_to_df(query=query_json_type) 11241 11242 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 11243 with pd.option_context("future.no_silent_downcasting", True): 11244 df_json_type.fillna(value="", inplace=True) 11245 replace_dict = {None: np.nan, "": np.nan} 11246 df_json_type.replace(replace_dict, inplace=True) 11247 df_json_type.dropna(inplace=True) 11248 11249 # Detect column type 11250 column_type = detect_column_type(df_json_type[key_clean]) 11251 11252 # Append 11253 query_json_key.append( 11254 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11255 ) 11256 11257 # Create view 11258 query_view = f""" 11259 CREATE TEMPORARY TABLE {view_name} 11260 AS ( 11261 SELECT *, {annotation_id} AS 'transcript' 11262 FROM ( 11263 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11264 FROM dataframe_annotation_format 11265 ) 11266 ); 11267 """ 11268 self.execute_query(query=query_view) 11269 11270 else: 11271 11272 # Return None 11273 view_name = None 11274 11275 # Remove added columns 11276 for added_column in added_columns: 11277 self.drop_column(column=added_column) 11278 11279 return view_name 11280 11281 def transcript_view_to_variants( 11282 self, 11283 transcripts_table: str = None, 11284 transcripts_column_id: str = None, 11285 transcripts_info_json: str = None, 11286 transcripts_info_field_json: str = None, 11287 transcripts_info_format: str = None, 11288 transcripts_info_field_format: str = None, 11289 param: dict = {}, 11290 ) -> bool: 11291 """ 11292 The `transcript_view_to_variants` function updates a variants table with information from 11293 transcripts in JSON format. 
11294 11295 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11296 table containing the transcripts data. If this parameter is not provided, the function will 11297 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11298 :type transcripts_table: str 11299 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11300 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11301 identifier is used to match transcripts with variants in the database 11302 :type transcripts_column_id: str 11303 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11304 of the column in the variants table where the transcripts information will be stored in JSON 11305 format. This parameter allows you to define the column in the variants table that will hold the 11306 JSON-formatted information about transcripts 11307 :type transcripts_info_json: str 11308 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11309 specify the field in the VCF header that will contain information about transcripts in JSON 11310 format. This field will be added to the VCF header as an INFO field with the specified name 11311 :type transcripts_info_field_json: str 11312 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11313 format of the information about transcripts that will be stored in the variants table. This 11314 format can be used to define how the transcript information will be structured or displayed 11315 within the variants table 11316 :type transcripts_info_format: str 11317 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11318 specify the field in the VCF header that will contain information about transcripts in a 11319 specific format. 
This field will be added to the VCF header as an INFO field with the specified 11320 name 11321 :type transcripts_info_field_format: str 11322 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11323 that contains various configuration settings related to transcripts. It is used to provide 11324 default values for certain parameters if they are not explicitly provided when calling the 11325 method. The `param` dictionary can be passed as an argument 11326 :type param: dict 11327 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11328 if the operation is successful and `False` if certain conditions are not met. 11329 """ 11330 11331 msg_info_prefix = "Start transcripts view to variants annotations" 11332 11333 log.debug(f"{msg_info_prefix}...") 11334 11335 # Default 11336 transcripts_table_default = "transcripts" 11337 transcripts_column_id_default = "transcript" 11338 transcripts_info_json_default = None 11339 transcripts_info_format_default = None 11340 transcripts_info_field_json_default = None 11341 transcripts_info_field_format_default = None 11342 11343 # Param 11344 if not param: 11345 param = self.get_param() 11346 11347 # Transcripts table 11348 if transcripts_table is None: 11349 transcripts_table = param.get("transcripts", {}).get( 11350 "table", transcripts_table_default 11351 ) 11352 11353 # Transcripts column ID 11354 if transcripts_column_id is None: 11355 transcripts_column_id = param.get("transcripts", {}).get( 11356 "column_id", transcripts_column_id_default 11357 ) 11358 11359 # Transcripts info json 11360 if transcripts_info_json is None: 11361 transcripts_info_json = param.get("transcripts", {}).get( 11362 "transcripts_info_json", transcripts_info_json_default 11363 ) 11364 11365 # Transcripts info field JSON 11366 if transcripts_info_field_json is None: 11367 transcripts_info_field_json = param.get("transcripts", {}).get( 11368 
"transcripts_info_field_json", transcripts_info_field_json_default 11369 ) 11370 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11371 # transcripts_info_json = transcripts_info_field_json 11372 11373 # Transcripts info format 11374 if transcripts_info_format is None: 11375 transcripts_info_format = param.get("transcripts", {}).get( 11376 "transcripts_info_format", transcripts_info_format_default 11377 ) 11378 11379 # Transcripts info field FORMAT 11380 if transcripts_info_field_format is None: 11381 transcripts_info_field_format = param.get("transcripts", {}).get( 11382 "transcripts_info_field_format", transcripts_info_field_format_default 11383 ) 11384 # if ( 11385 # transcripts_info_field_format is not None 11386 # and transcripts_info_format is None 11387 # ): 11388 # transcripts_info_format = transcripts_info_field_format 11389 11390 # Variants table 11391 table_variants = self.get_table_variants() 11392 11393 # Check info columns param 11394 if ( 11395 transcripts_info_json is None 11396 and transcripts_info_field_json is None 11397 and transcripts_info_format is None 11398 and transcripts_info_field_format is None 11399 ): 11400 return False 11401 11402 # Transcripts infos columns 11403 query_transcripts_infos_columns = f""" 11404 SELECT * 11405 FROM ( 11406 DESCRIBE SELECT * FROM {transcripts_table} 11407 ) 11408 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11409 """ 11410 transcripts_infos_columns = list( 11411 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11412 ) 11413 11414 # View results 11415 clause_select = [] 11416 clause_to_json = [] 11417 clause_to_format = [] 11418 for field in transcripts_infos_columns: 11419 # Do not consider INFO field for export into fields 11420 if field not in ["INFO"]: 11421 clause_select.append( 11422 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11423 ) 11424 clause_to_json.append(f""" 
'{field}': "{field}" """) 11425 clause_to_format.append(f""" "{field}" """) 11426 11427 # Update 11428 update_set_json = [] 11429 update_set_format = [] 11430 11431 # VCF header 11432 vcf_reader = self.get_header() 11433 11434 # Transcripts to info column in JSON 11435 if transcripts_info_json: 11436 11437 # Create column on variants table 11438 self.add_column( 11439 table_name=table_variants, 11440 column_name=transcripts_info_json, 11441 column_type="JSON", 11442 default_value=None, 11443 drop=False, 11444 ) 11445 11446 # Add header 11447 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11448 transcripts_info_json, 11449 ".", 11450 "String", 11451 "Transcripts in JSON format", 11452 "unknwon", 11453 "unknwon", 11454 self.code_type_map["String"], 11455 ) 11456 11457 # Add to update 11458 update_set_json.append( 11459 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11460 ) 11461 11462 # Transcripts to info field in JSON 11463 if transcripts_info_field_json: 11464 11465 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11466 11467 # Add to update 11468 update_set_json.append( 11469 f""" 11470 INFO = concat( 11471 CASE 11472 WHEN INFO NOT IN ('', '.') 11473 THEN INFO 11474 ELSE '' 11475 END, 11476 CASE 11477 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11478 THEN concat( 11479 ';{transcripts_info_field_json}=', 11480 t.{transcripts_info_json} 11481 ) 11482 ELSE '' 11483 END 11484 ) 11485 """ 11486 ) 11487 11488 # Add header 11489 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11490 transcripts_info_field_json, 11491 ".", 11492 "String", 11493 "Transcripts in JSON format", 11494 "unknwon", 11495 "unknwon", 11496 self.code_type_map["String"], 11497 ) 11498 11499 if update_set_json: 11500 11501 # Update query 11502 query_update = f""" 11503 UPDATE {table_variants} 11504 SET {", ".join(update_set_json)} 11505 FROM 11506 ( 11507 SELECT 11508 "#CHROM", POS, REF, ALT, 11509 concat( 11510 '{{', 11511 
string_agg( 11512 '"' || "{transcripts_column_id}" || '":' || 11513 to_json(json_output) 11514 ), 11515 '}}' 11516 )::JSON AS {transcripts_info_json} 11517 FROM 11518 ( 11519 SELECT 11520 "#CHROM", POS, REF, ALT, 11521 "{transcripts_column_id}", 11522 to_json( 11523 {{{",".join(clause_to_json)}}} 11524 )::JSON AS json_output 11525 FROM 11526 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11527 WHERE "{transcripts_column_id}" IS NOT NULL 11528 ) 11529 GROUP BY "#CHROM", POS, REF, ALT 11530 ) AS t 11531 WHERE {table_variants}."#CHROM" = t."#CHROM" 11532 AND {table_variants}."POS" = t."POS" 11533 AND {table_variants}."REF" = t."REF" 11534 AND {table_variants}."ALT" = t."ALT" 11535 """ 11536 11537 self.execute_query(query=query_update) 11538 11539 # Transcripts to info column in FORMAT 11540 if transcripts_info_format: 11541 11542 # Create column on variants table 11543 self.add_column( 11544 table_name=table_variants, 11545 column_name=transcripts_info_format, 11546 column_type="VARCHAR", 11547 default_value=None, 11548 drop=False, 11549 ) 11550 11551 # Add header 11552 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11553 transcripts_info_format, 11554 ".", 11555 "String", 11556 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11557 "unknwon", 11558 "unknwon", 11559 self.code_type_map["String"], 11560 ) 11561 11562 # Add to update 11563 update_set_format.append( 11564 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11565 ) 11566 11567 else: 11568 11569 # Set variable for internal queries 11570 transcripts_info_format = "transcripts_info_format" 11571 11572 # Transcripts to info field in JSON 11573 if transcripts_info_field_format: 11574 11575 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11576 11577 # Add to update 11578 update_set_format.append( 11579 f""" 11580 INFO = concat( 11581 CASE 11582 WHEN INFO NOT IN ('', 
'.') 11583 THEN INFO 11584 ELSE '' 11585 END, 11586 CASE 11587 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11588 THEN concat( 11589 ';{transcripts_info_field_format}=', 11590 t.{transcripts_info_format} 11591 ) 11592 ELSE '' 11593 END 11594 ) 11595 """ 11596 ) 11597 11598 # Add header 11599 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11600 transcripts_info_field_format, 11601 ".", 11602 "String", 11603 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11604 "unknwon", 11605 "unknwon", 11606 self.code_type_map["String"], 11607 ) 11608 11609 if update_set_format: 11610 11611 # Update query 11612 query_update = f""" 11613 UPDATE {table_variants} 11614 SET {", ".join(update_set_format)} 11615 FROM 11616 ( 11617 SELECT 11618 "#CHROM", POS, REF, ALT, 11619 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11620 FROM 11621 ( 11622 SELECT 11623 "#CHROM", POS, REF, ALT, 11624 "{transcripts_column_id}", 11625 concat( 11626 "{transcripts_column_id}", 11627 '|', 11628 {", '|', ".join(clause_to_format)} 11629 ) AS {transcripts_info_format} 11630 FROM 11631 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11632 ) 11633 GROUP BY "#CHROM", POS, REF, ALT 11634 ) AS t 11635 WHERE {table_variants}."#CHROM" = t."#CHROM" 11636 AND {table_variants}."POS" = t."POS" 11637 AND {table_variants}."REF" = t."REF" 11638 AND {table_variants}."ALT" = t."ALT" 11639 """ 11640 11641 self.execute_query(query=query_update) 11642 11643 return True 11644 11645 def rename_info_fields( 11646 self, fields_to_rename: dict = None, table: str = None 11647 ) -> dict: 11648 """ 11649 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11650 corresponding INFO fields in the variants table. 
11651 11652 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11653 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11654 represent the original field names that need to be renamed, and the corresponding values 11655 represent the new names to which the fields should be 11656 :type fields_to_rename: dict 11657 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11658 the table in which the variants data is stored. This table contains information about genetic 11659 variants, and the function updates the corresponding INFO fields in this table when renaming 11660 specified fields in the VCF file header 11661 :type table: str 11662 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11663 the original field names as keys and their corresponding new names (or None if the field was 11664 removed) as values after renaming or removing specified fields in a VCF file header and updating 11665 corresponding INFO fields in the variants table. 
11666 """ 11667 11668 # Init 11669 fields_renamed = {} 11670 config = self.get_config() 11671 access = config.get("access") 11672 11673 if table is None: 11674 table = self.get_table_variants() 11675 11676 if fields_to_rename is not None and access not in ["RO"]: 11677 11678 log.info("Rename or remove fields...") 11679 11680 # Header 11681 header = self.get_header() 11682 11683 for field_to_rename, field_renamed in fields_to_rename.items(): 11684 11685 if field_to_rename in header.infos: 11686 11687 # Rename header 11688 if field_renamed is not None: 11689 header.infos[field_renamed] = vcf.parser._Info( 11690 field_renamed, 11691 header.infos[field_to_rename].num, 11692 header.infos[field_to_rename].type, 11693 header.infos[field_to_rename].desc, 11694 header.infos[field_to_rename].source, 11695 header.infos[field_to_rename].version, 11696 header.infos[field_to_rename].type_code, 11697 ) 11698 del header.infos[field_to_rename] 11699 11700 # Rename INFO patterns 11701 field_pattern = rf'(^|;)({field_to_rename})=([^;]*)' 11702 if field_renamed is not None: 11703 field_renamed_pattern = rf'\1{field_renamed}=\3' 11704 else: 11705 field_renamed_pattern = '' 11706 11707 # Rename INFO 11708 query = f""" 11709 UPDATE {table} 11710 SET 11711 INFO = regexp_replace(INFO, '{field_pattern}', '{field_renamed_pattern}', 'g') 11712 """ 11713 self.execute_query(query=query) 11714 11715 # Return 11716 fields_renamed[field_to_rename] = field_renamed 11717 11718 # Log 11719 if field_renamed is not None: 11720 log.info(f"Rename or remove fields: field '{field_to_rename}' renamed to '{field_renamed}'") 11721 else: 11722 log.info(f"Rename or remove fields: field '{field_to_rename}' removed") 11723 11724 return fields_renamed 11725 11726 def calculation_rename_info_fields( 11727 self, 11728 fields_to_rename: dict = None, 11729 table: str = None, 11730 operation_name: str = "RENAME_INFO_FIELDS", 11731 ) -> None: 11732 """ 11733 The `calculation_rename_info_fields` function retrieves 
parameters from a dictionary, updates 11734 fields to rename and table if provided, and then calls another function to rename the fields. 11735 11736 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11737 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11738 the key and the new field name as the value 11739 :type fields_to_rename: dict 11740 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11741 specify the name of the table for which the fields are to be renamed. It is a string type 11742 parameter 11743 :type table: str 11744 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11745 method is a string that specifies the name of the operation being performed. In this context, it 11746 is used as a default value for the operation name if not explicitly provided when calling the 11747 function, defaults to RENAME_INFO_FIELDS 11748 :type operation_name: str (optional) 11749 """ 11750 11751 # Param 11752 param = self.get_param() 11753 11754 # Get param fields to rename 11755 param_fields_to_rename = ( 11756 param.get("calculation", {}) 11757 .get("calculations", {}) 11758 .get(operation_name, {}) 11759 .get("fields_to_rename", None) 11760 ) 11761 11762 # Get param table 11763 param_table = ( 11764 param.get("calculation", {}) 11765 .get("calculations", {}) 11766 .get(operation_name, {}) 11767 .get("table", None) 11768 ) 11769 11770 # Init fields_to_rename 11771 if fields_to_rename is None: 11772 fields_to_rename = param_fields_to_rename 11773 11774 # Init table 11775 if table is None: 11776 table = param_table 11777 11778 renamed_fields = self.rename_info_fields( 11779 fields_to_rename=fields_to_rename, table=table 11780 ) 11781 11782 log.debug(f"renamed_fields:{renamed_fields}")
class Variants:
    """
    In-memory/DuckDB representation of a variants file (VCF and related
    formats): wires together input/output paths, configuration, parameters,
    the database connexion, the VCF header and the sample list.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input, config, param, output, connexion, header
        and samples, then optionally load the data.

        :param conn: the connection to the database (a new one is created when
            None)
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: if True, `load_data()` is called after initialization

        NOTE(review): the mutable default arguments `config={}` / `param={}`
        are shared across calls — confirm they are never mutated in place.
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the `samples` attribute, falling back to
        `param["samples"]["list"]` when no list is provided.

        :param samples: List of sample names; when falsy, the list is taken
            from the parameters (None when absent there too)
        :type samples: list
        :return: The resulting `samples` list (possibly None)
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        Return the `samples` attribute.

        :return: The list of samples set by `set_samples` (possibly None)
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        Return `param["samples"]["check"]`.

        :return: The value of the "check" key in the "samples" parameter
            dictionary; True when the key is absent.
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        Set the input-file attributes: `input`, and when a path is given,
        `input_name`, `input_extension` and `input_format` derived from it.

        Non-string inputs are accepted when they expose a `.name` attribute
        (e.g. file-like objects); otherwise a ValueError is raised.

        :param input: Input file path (or object with a `.name` attribute)
        :type input: str
        :raises ValueError: When `input` is neither a string nor an object
            with a usable `.name` attribute

        NOTE(review): the bare `except:` below swallows all exceptions
        (including KeyboardInterrupt), and the error message literal has an
        unbalanced quote (`'{input}` without closing quote) — both left as-is
        here; confirm before changing.
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            except:
                log.error(f"Input file '{input} in bad format")
                raise ValueError(f"Input file '{input} in bad format")
        else:
            self.input = input

        # Input format: derive name/extension/format from the path
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Set the configuration dictionary for this instance.

        :param config: Dictionary of configuration settings
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Set the parameter dictionary for this instance.

        :param param: Dictionary of parameters
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        Initialize the instance attributes used throughout the class:
        table/prefix names, the comparison-operator map, and the VCF type
        code maps.
        """

        # Prefix used for temporary resources
        self.prefix = "howard"
        # Name of the main variants table
        self.table_variants = "variants"
        self.dataframe = None

        # Filter-operator keywords to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF type name to internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF type name to SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        Return the "indexing" parameter.

        :return: The value of the "indexing" key in the parameters; False when
            absent.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        Build the connexion configuration dictionary (threads, memory limit,
        temporary directory, access mode) from the instance configuration.

        :return: A dictionary suitable for opening the database connexion.
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        # if config.get("memory", None):
        #     connexion_config["memory_limit"] = config.get("memory")
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access: map "RO"/"RW" to DuckDB access modes; in-memory databases
        # are always opened read-write
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # NOTE(review): `in ":memory:"` is a substring test, not equality —
            # it is also True for e.g. "memory" or ":"; confirm `==` was meant.
            if connexion_db in ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        Retrieve DuckDB settings from the configuration, either from a file
        (YAML/JSON) pointed to by `config["duckdb_settings"]` or by parsing
        that value directly as JSON.

        :return: The settings dictionary (empty when not configured).
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb setting is a file (YAML parser also accepts JSON)
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        Determine and set the database connexion target based on the input
        format and the configured connexion type: the input file itself for
        duckdb inputs, in-memory by default, a temporary file for "tmpfile",
        or the connexion type value used as a path.

        :return: The chosen connexion database string.
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        The function `set_connexion` creates a connection to a database, with options for different
        database formats and settings.

        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
        database. If a connection is not provided, a new connection to an in-memory database is created.
317 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 318 sqlite 319 """ 320 321 # Connexion db 322 connexion_db = self.set_connexion_db() 323 324 # Connexion config 325 connexion_config = self.get_connexion_config() 326 327 # Connexion format 328 connexion_format = self.get_config().get("connexion_format", "duckdb") 329 # Set connexion format 330 self.connexion_format = connexion_format 331 332 # Connexion 333 if not conn: 334 if connexion_format in ["duckdb"]: 335 conn = duckdb.connect(connexion_db, config=connexion_config) 336 # duckDB settings 337 duckdb_settings = self.get_duckdb_settings() 338 if duckdb_settings: 339 for setting in duckdb_settings: 340 setting_value = duckdb_settings.get(setting) 341 if isinstance(setting_value, str): 342 setting_value = f"'{setting_value}'" 343 conn.execute(f"PRAGMA {setting}={setting_value};") 344 elif connexion_format in ["sqlite"]: 345 conn = sqlite3.connect(connexion_db) 346 347 # Set connexion 348 self.conn = conn 349 350 # Log 351 log.debug(f"connexion_format: {connexion_format}") 352 log.debug(f"connexion_db: {connexion_db}") 353 log.debug(f"connexion config: {connexion_config}") 354 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 355 356 def set_output(self, output: str = None) -> None: 357 """ 358 The `set_output` function in Python sets the output file based on the input or a specified key 359 in the config file, extracting the output name, extension, and format. 360 361 :param output: The `output` parameter in the `set_output` method is used to specify the name of 362 the output file. If the config file has an 'output' key, the method sets the output to the value 363 of that key. 
If no output is provided, it sets the output to `None` 364 :type output: str 365 """ 366 367 if output and not isinstance(output, str): 368 self.output = output.name 369 else: 370 self.output = output 371 372 # Output format 373 if self.output: 374 output_name, output_extension = os.path.splitext(self.output) 375 self.output_name = output_name 376 self.output_extension = output_extension 377 self.output_format = self.output_extension.replace(".", "") 378 else: 379 self.output_name = None 380 self.output_extension = None 381 self.output_format = None 382 383 def set_header(self) -> None: 384 """ 385 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 386 """ 387 388 input_file = self.get_input() 389 default_header_list = [ 390 "##fileformat=VCFv4.2", 391 "#CHROM POS ID REF ALT QUAL FILTER INFO", 392 ] 393 394 # Full path 395 input_file = full_path(input_file) 396 397 if input_file: 398 399 input_format = self.get_input_format() 400 input_compressed = self.get_input_compressed() 401 config = self.get_config() 402 header_list = default_header_list 403 if input_format in [ 404 "vcf", 405 "hdr", 406 "tsv", 407 "csv", 408 "psv", 409 "parquet", 410 "db", 411 "duckdb", 412 ]: 413 # header provided in param 414 if config.get("header_file", None): 415 with open(config.get("header_file"), "rt") as f: 416 header_list = self.read_vcf_header(f) 417 # within a vcf file format (header within input file itsself) 418 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 419 # within a compressed vcf file format (.vcf.gz) 420 if input_compressed: 421 with bgzf.open(input_file, "rt") as f: 422 header_list = self.read_vcf_header(f) 423 # within an uncompressed vcf file format (.vcf) 424 else: 425 with open(input_file, "rt") as f: 426 header_list = self.read_vcf_header(f) 427 # header provided in default external file .hdr 428 elif os.path.exists((input_file + ".hdr")): 429 with open(input_file + ".hdr", "rt") as f: 430 header_list = 
self.read_vcf_header(f) 431 else: 432 try: # Try to get header info fields and file columns 433 434 with tempfile.TemporaryDirectory() as tmpdir: 435 436 # Create database 437 db_for_header = Database(database=input_file) 438 439 # Get header columns for infos fields 440 db_header_from_columns = ( 441 db_for_header.get_header_from_columns() 442 ) 443 444 # Get real columns in the file 445 db_header_columns = db_for_header.get_columns() 446 447 # Write header file 448 header_file_tmp = os.path.join(tmpdir, "header") 449 f = open(header_file_tmp, "w") 450 vcf.Writer(f, db_header_from_columns) 451 f.close() 452 453 # Replace #CHROM line with rel columns 454 header_list = db_for_header.read_header_file( 455 header_file=header_file_tmp 456 ) 457 header_list[-1] = "\t".join(db_header_columns) 458 459 except: 460 461 log.warning( 462 f"No header for file {input_file}. Set as default VCF header" 463 ) 464 header_list = default_header_list 465 466 else: # try for unknown format ? 467 468 log.error(f"Input file format '{input_format}' not available") 469 raise ValueError(f"Input file format '{input_format}' not available") 470 471 if not header_list: 472 header_list = default_header_list 473 474 # header as list 475 self.header_list = header_list 476 477 # header as VCF object 478 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 479 480 else: 481 482 self.header_list = None 483 self.header_vcf = None 484 485 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 486 """ 487 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 488 DataFrame based on the connection format. 489 490 :param query: The `query` parameter in the `get_query_to_df` function is a string that 491 represents the SQL query you want to execute. 
    def get_overview(self) -> None:
        """
        Log an overview of the current object: input, output, config, param,
        sample list and the variants dataframe.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # Release the (possibly large) dataframe eagerly
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, variant counts
        by chromosome and type, sample genotypes, header INFO/FORMAT fields,
        and quality distribution.

        :return: dict with sections "Infos", "Variants", "Samples", "Header"
            and (when a QUAL column exists) "Quality"
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: count genotypes per sample, keeping only rows whose
        # sample value looks like a genotype and matches the FORMAT arity
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                        )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num — special VCF 'Number' codes: None='.', -1='A', -2='G', -3='R'
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel branch, SQL AND binds tighter than OR, so
        # the condition reads len(REF)>1 OR (len(ALT)>1 AND len(REF)!=len(ALT))
        # — confirm this is the intended classification

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        Write the statistics of the current object to a JSON file.

        :param file: path of the JSON file to write
        :return: the written file path
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to the stats file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report from the statistics JSON file and print it.

        :param output_file: markdown output path; defaults to 'stats.md' in a
            temporary directory
        :param json_file: JSON stats path; defaults to 'stats.json' in a
            temporary directory
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the info as a table (dict of dicts),
                        # falling back to a simple bullet line
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
If no `output_file` is 822 provided, a temporary directory will be created and the stats will be saved in a file named 823 "stats.md" within that 824 :type output_file: str 825 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 826 file where the statistics will be saved. If no value is provided, a temporary directory will be 827 created and a default file name "stats.json" will be used 828 :type json_file: str 829 :return: The function `print_stats` does not return any value. It has a return type annotation 830 of `None`. 831 """ 832 833 # Full path 834 output_file = full_path(output_file) 835 json_file = full_path(json_file) 836 837 with tempfile.TemporaryDirectory() as tmpdir: 838 839 # Files 840 if not output_file: 841 output_file = os.path.join(tmpdir, "stats.md") 842 if not json_file: 843 json_file = os.path.join(tmpdir, "stats.json") 844 845 # Create folders 846 if not os.path.exists(os.path.dirname(output_file)): 847 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 848 if not os.path.exists(os.path.dirname(json_file)): 849 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 850 851 # Create stats JSON file 852 stats_file = self.stats_to_file(file=json_file) 853 854 # Print stats file 855 with open(stats_file) as f: 856 stats = yaml.safe_load(f) 857 858 # Output 859 output_title = [] 860 output_index = [] 861 output = [] 862 863 # Title 864 output_title.append("# HOWARD Stats") 865 866 # Index 867 output_index.append("## Index") 868 869 # Process sections 870 for section in stats: 871 infos = stats.get(section) 872 section_link = "#" + section.lower().replace(" ", "-") 873 output.append(f"## {section}") 874 output_index.append(f"- [{section}]({section_link})") 875 876 if len(infos): 877 for info in infos: 878 try: 879 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 880 is_df = True 881 except: 882 try: 883 df = pd.DataFrame.from_dict( 884 
json.loads((infos.get(info))), orient="index" 885 ) 886 is_df = True 887 except: 888 is_df = False 889 if is_df: 890 output.append(f"### {info}") 891 info_link = "#" + info.lower().replace(" ", "-") 892 output_index.append(f" - [{info}]({info_link})") 893 output.append(f"{df.to_markdown(index=False)}") 894 else: 895 output.append(f"- {info}: {infos.get(info)}") 896 else: 897 output.append(f"NA") 898 899 # Write stats in markdown file 900 with open(output_file, "w") as fp: 901 for item in output_title: 902 fp.write("%s\n" % item) 903 for item in output_index: 904 fp.write("%s\n" % item) 905 for item in output: 906 fp.write("%s\n" % item) 907 908 # Output stats in markdown 909 print("") 910 print("\n\n".join(output_title)) 911 print("") 912 print("\n\n".join(output)) 913 print("") 914 915 return None 916 917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input 923 924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format 940 941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. 
If no `input_file` is provided when calling the 948 method, it will default to `None` and the method will then call `self.get_input()` to 949 :type input_file: str 950 :return: The function `get_input_compressed` returns the compressed format of the input 951 variable. 952 """ 953 954 if not input_file: 955 input_file = self.get_input() 956 input_compressed = get_file_compressed(input_file) 957 return input_compressed 958 959 def get_output(self) -> str: 960 """ 961 It returns the output of the neuron. 962 :return: The output of the neural network. 963 """ 964 965 return self.output 966 967 def get_output_format(self, output_file: str = None) -> str: 968 """ 969 The function `get_output_format` returns the format of the input variable or the output file if 970 provided. 971 972 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 973 that represents the file path of the output file. If no `output_file` is provided when calling 974 the method, it will default to the output obtained from the `get_output` method of the class 975 instance. The 976 :type output_file: str 977 :return: The format of the input variable is being returned. 978 """ 979 980 if not output_file: 981 output_file = self.get_output() 982 output_format = get_file_format(output_file) 983 984 return output_format 985 986 def get_config(self) -> dict: 987 """ 988 It returns the config 989 :return: The config variable is being returned. 990 """ 991 return self.config 992 993 def get_param(self) -> dict: 994 """ 995 It returns the param 996 :return: The param variable is being returned. 997 """ 998 return self.param 999 1000 def get_connexion_db(self) -> str: 1001 """ 1002 It returns the connexion_db attribute of the object 1003 :return: The connexion_db is being returned. 1004 """ 1005 return self.connexion_db 1006 1007 def get_prefix(self) -> str: 1008 """ 1009 It returns the prefix of the object. 1010 :return: The prefix is being returned. 
1011 """ 1012 return self.prefix 1013 1014 def get_table_variants(self, clause: str = "select") -> str: 1015 """ 1016 This function returns the table_variants attribute of the object 1017 1018 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1019 defaults to select (optional) 1020 :return: The table_variants attribute of the object. 1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants 1041 1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 ) 1053 1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory") 1061 1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn 1069 1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 
1073 :return: The connection is being closed. 1074 """ 1075 return self.conn.close() 1076 1077 def get_header(self, type: str = "vcf"): 1078 """ 1079 This function returns the header of the VCF file as a list of strings 1080 1081 :param type: the type of header you want to get, defaults to vcf (optional) 1082 :return: The header of the vcf file. 1083 """ 1084 1085 if self.header_vcf: 1086 if type == "vcf": 1087 return self.header_vcf 1088 elif type == "list": 1089 return self.header_list 1090 else: 1091 if type == "vcf": 1092 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1093 return header 1094 elif type == "list": 1095 return vcf_required 1096 1097 def get_header_infos_list(self) -> list: 1098 """ 1099 This function retrieves a list of information fields from the header. 1100 :return: A list of information fields from the header. 1101 """ 1102 1103 # Init 1104 infos_list = [] 1105 1106 for field in self.get_header().infos: 1107 infos_list.append(field) 1108 1109 return infos_list 1110 1111 def get_header_length(self, file: str = None) -> int: 1112 """ 1113 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1114 line. 1115 1116 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1117 header file. If this argument is provided, the function will read the header from the specified 1118 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1119 :type file: str 1120 :return: the length of the header list, excluding the #CHROM line. 1121 """ 1122 1123 if file: 1124 return len(self.read_vcf_header_file(file=file)) - 1 1125 elif self.get_header(type="list"): 1126 return len(self.get_header(type="list")) - 1 1127 else: 1128 return 0 1129 1130 def get_header_columns(self) -> str: 1131 """ 1132 This function returns the header list of a VCF 1133 1134 :return: The length of the header list. 
1135 """ 1136 if self.get_header(): 1137 return self.get_header(type="list")[-1] 1138 else: 1139 return "" 1140 1141 def get_header_columns_as_list(self) -> list: 1142 """ 1143 This function returns the header list of a VCF 1144 1145 :return: The length of the header list. 1146 """ 1147 if self.get_header(): 1148 return self.get_header_columns().strip().split("\t") 1149 else: 1150 return [] 1151 1152 def get_header_columns_as_sql(self) -> str: 1153 """ 1154 This function retruns header length (without #CHROM line) 1155 1156 :return: The length of the header list. 1157 """ 1158 sql_column_list = [] 1159 for col in self.get_header_columns_as_list(): 1160 sql_column_list.append(f'"{col}"') 1161 return ",".join(sql_column_list) 1162 1163 def get_header_sample_list( 1164 self, check: bool = False, samples: list = None, samples_force: bool = False 1165 ) -> list: 1166 """ 1167 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1168 checking and filtering based on input parameters. 1169 1170 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1171 parameter that determines whether to check if the samples in the list are properly defined as 1172 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1173 list is defined as a, defaults to False 1174 :type check: bool (optional) 1175 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1176 allows you to specify a subset of samples from the header. If you provide a list of sample 1177 names, the function will check if each sample is defined in the header. 
If a sample is not found 1178 in the 1179 :type samples: list 1180 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1181 a boolean parameter that determines whether to force the function to return the sample list 1182 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1183 function will return the sample list without performing, defaults to False 1184 :type samples_force: bool (optional) 1185 :return: The function `get_header_sample_list` returns a list of samples based on the input 1186 parameters and conditions specified in the function. 1187 """ 1188 1189 # Init 1190 samples_list = [] 1191 1192 if samples is None: 1193 samples_list = self.header_vcf.samples 1194 else: 1195 samples_checked = [] 1196 for sample in samples: 1197 if sample in self.header_vcf.samples: 1198 samples_checked.append(sample) 1199 else: 1200 log.warning(f"Sample '{sample}' not defined in header") 1201 samples_list = samples_checked 1202 1203 # Force sample list without checking if is_genotype_column 1204 if samples_force: 1205 log.warning(f"Samples {samples_list} not checked if genotypes") 1206 return samples_list 1207 1208 if check: 1209 samples_checked = [] 1210 for sample in samples_list: 1211 if self.is_genotype_column(column=sample): 1212 samples_checked.append(sample) 1213 else: 1214 log.warning( 1215 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1216 ) 1217 samples_list = samples_checked 1218 1219 # Return samples list 1220 return samples_list 1221 1222 def is_genotype_column(self, column: str = None) -> bool: 1223 """ 1224 This function checks if a given column is a genotype column in a database. 1225 1226 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1227 represents the column name in a database table. This method checks if the specified column is a 1228 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1229 method of 1230 :type column: str 1231 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1232 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1233 column name and returns the result. If the `column` parameter is None, it returns False. 1234 """ 1235 1236 if column is not None: 1237 return Database(database=self.get_input()).is_genotype_column(column=column) 1238 else: 1239 return False 1240 1241 def get_verbose(self) -> bool: 1242 """ 1243 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1244 exist 1245 1246 :return: The value of the key "verbose" in the config dictionary. 1247 """ 1248 return self.get_config().get("verbose", False) 1249 1250 def get_connexion_format(self) -> str: 1251 """ 1252 It returns the connexion format of the object. 1253 :return: The connexion_format is being returned. 1254 """ 1255 connexion_format = self.connexion_format 1256 if connexion_format not in ["duckdb", "sqlite"]: 1257 log.error(f"Unknown connexion format {connexion_format}") 1258 raise ValueError(f"Unknown connexion format {connexion_format}") 1259 else: 1260 return connexion_format 1261 1262 def insert_file_to_table( 1263 self, 1264 file, 1265 columns: str, 1266 header_len: int = 0, 1267 sep: str = "\t", 1268 chunksize: int = 1000000, 1269 ) -> None: 1270 """ 1271 The function reads a file in chunks and inserts each chunk into a table based on the specified 1272 database format. 1273 1274 :param file: The `file` parameter is the file that you want to load into a table. It should be 1275 the path to the file on your system 1276 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1277 should contain the names of the columns in the table where the data will be inserted. 
The column 1278 names should be separated by commas within the string. For example, if you have columns named 1279 "id", "name 1280 :type columns: str 1281 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1282 the number of lines to skip at the beginning of the file before reading the actual data. This 1283 parameter allows you to skip any header information present in the file before processing the 1284 data, defaults to 0 1285 :type header_len: int (optional) 1286 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1287 separator character that is used in the file being read. In this case, the default separator is 1288 set to `\t`, which represents a tab character. You can change this parameter to a different 1289 separator character if, defaults to \t 1290 :type sep: str (optional) 1291 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1292 when processing the file in chunks. In the provided code snippet, the default value for 1293 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1294 to 1000000 1295 :type chunksize: int (optional) 1296 """ 1297 1298 # Config 1299 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1300 connexion_format = self.get_connexion_format() 1301 1302 log.debug("chunksize: " + str(chunksize)) 1303 1304 if chunksize: 1305 for chunk in pd.read_csv( 1306 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1307 ): 1308 if connexion_format in ["duckdb"]: 1309 sql_insert_into = ( 1310 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1311 ) 1312 self.conn.execute(sql_insert_into) 1313 elif connexion_format in ["sqlite"]: 1314 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1315 1316 def load_data( 1317 self, 1318 input_file: str = None, 1319 drop_variants_table: bool = False, 1320 sample_size: int = 20480, 1321 ) -> None: 1322 """ 1323 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1324 table before loading the data and specify a sample size. 1325 1326 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1327 table 1328 :type input_file: str 1329 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1330 determines whether the variants table should be dropped before loading the data. If set to 1331 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1332 not be dropped, defaults to False 1333 :type drop_variants_table: bool (optional) 1334 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1335 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1336 20480 1337 :type sample_size: int (optional) 1338 """ 1339 1340 log.info("Loading...") 1341 1342 # change input file 1343 if input_file: 1344 self.set_input(input_file) 1345 self.set_header() 1346 1347 # drop variants table 1348 if drop_variants_table: 1349 self.drop_variants_table() 1350 1351 # get table variants 1352 table_variants = self.get_table_variants() 1353 1354 # Access 1355 access = self.get_config().get("access", None) 1356 log.debug(f"access: {access}") 1357 1358 # Input format and compress 1359 input_format = self.get_input_format() 1360 input_compressed = self.get_input_compressed() 1361 log.debug(f"input_format: {input_format}") 1362 log.debug(f"input_compressed: {input_compressed}") 1363 1364 # input_compressed_format 1365 if input_compressed: 1366 input_compressed_format = "gzip" 1367 else: 1368 input_compressed_format = "none" 1369 log.debug(f"input_compressed_format: {input_compressed_format}") 1370 1371 # Connexion format 1372 connexion_format = self.get_connexion_format() 1373 1374 # Sample size 1375 if not sample_size: 1376 sample_size = -1 1377 log.debug(f"sample_size: {sample_size}") 1378 1379 # Load data 1380 log.debug(f"Load Data from {input_format}") 1381 1382 # DuckDB connexion 1383 if connexion_format in ["duckdb"]: 1384 1385 # Database already exists 1386 if self.input_format in ["db", "duckdb"]: 1387 1388 if connexion_format in ["duckdb"]: 1389 log.debug(f"Input file format '{self.input_format}' duckDB") 1390 else: 1391 log.error( 1392 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1393 ) 1394 raise ValueError( 1395 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1396 ) 1397 1398 # Load from existing database format 1399 else: 1400 1401 try: 1402 # Create Table or View 1403 database = Database(database=self.input) 1404 sql_from = 
database.get_sql_from(sample_size=sample_size) 1405 1406 if access in ["RO"]: 1407 sql_load = ( 1408 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1409 ) 1410 else: 1411 sql_load = ( 1412 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1413 ) 1414 self.conn.execute(sql_load) 1415 1416 except: 1417 # Format not available 1418 log.error(f"Input file format '{self.input_format}' not available") 1419 raise ValueError( 1420 f"Input file format '{self.input_format}' not available" 1421 ) 1422 1423 # SQLite connexion 1424 elif connexion_format in ["sqlite"] and input_format in [ 1425 "vcf", 1426 "tsv", 1427 "csv", 1428 "psv", 1429 ]: 1430 1431 # Main structure 1432 structure = { 1433 "#CHROM": "VARCHAR", 1434 "POS": "INTEGER", 1435 "ID": "VARCHAR", 1436 "REF": "VARCHAR", 1437 "ALT": "VARCHAR", 1438 "QUAL": "VARCHAR", 1439 "FILTER": "VARCHAR", 1440 "INFO": "VARCHAR", 1441 } 1442 1443 # Strcuture with samples 1444 structure_complete = structure 1445 if self.get_header_sample_list(): 1446 structure["FORMAT"] = "VARCHAR" 1447 for sample in self.get_header_sample_list(): 1448 structure_complete[sample] = "VARCHAR" 1449 1450 # Columns list for create and insert 1451 sql_create_table_columns = [] 1452 sql_create_table_columns_list = [] 1453 for column in structure_complete: 1454 column_type = structure_complete[column] 1455 sql_create_table_columns.append( 1456 f'"{column}" {column_type} default NULL' 1457 ) 1458 sql_create_table_columns_list.append(f'"{column}"') 1459 1460 # Create database 1461 log.debug(f"Create Table {table_variants}") 1462 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1463 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1464 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1465 self.conn.execute(sql_create_table) 1466 1467 # chunksize define length of file chunk load file 1468 chunksize = 100000 1469 1470 # delimiter 1471 delimiter 
    def get_explode_infos(self) -> bool:
        """
        Return the "explode_infos" flag from the parameters.

        :return: value of param ["explode"]["explode_infos"], False when absent
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Resolve the list of INFO fields to explode into columns.

        Fields may be given as a comma-separated string or a list; each entry
        is treated as a regex pattern matched against the INFO fields declared
        in the header. The keyword "*" (also the default when nothing is
        configured) expands to all header INFO fields.

        :param explode_infos_fields: fields to explode, as a comma-separated
            string or a list of names/patterns; falls back to param
            ["explode"]["explode_infos_fields"], then to "*"
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: when True, drop fields that are not
            declared in the header; when False, unknown fields are kept as-is,
            defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: ordered, de-duplicated list of resolved field names
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (string is split on commas; lists pass through)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (sorted for deterministic output order)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact header match wins over its pattern expansion;
                # otherwise explicitly-requested fields are removed from the
                # expansion so they keep their own requested position
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header),
                # keep it verbatim unless the caller asked to filter it out
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header
                    # (if asked); the ".*" placeholder itself is never emitted
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        Return the prefix used for exploded INFO columns.

        :param explode_infos_prefix: explicit prefix; when falsy, falls back to
            param ["explode"]["explode_infos_prefix"] (default "")
        :type explode_infos_prefix: str
        :return: the resolved prefix
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table (DuckDB or SQLite), optionally dropping an
        existing column of the same name first.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add (matched
            case-insensitively against existing columns)
        :param column_type: SQL type of the new column (e.g. "INTEGER",
            "VARCHAR")
        :param default_value: optional DEFAULT clause value for the new column
        :param drop: when True and the column already exists, drop it and
            re-create it; when False, an existing column is left untouched,
            defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (table_name, column_name,
            column_type, default_value), or None when nothing was added
            (column already present, or re-created after a drop)
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): a drop+re-create deliberately reports None below
        # (added = False); callers distinguish this case via their own `force`
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
If a default value is provided, it will be assigned to 1661 the column for any existing rows that do not have a value for that column 1662 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1663 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1664 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1665 to False 1666 :type drop: bool (optional) 1667 :return: a boolean value indicating whether the column was successfully added to the table. 1668 """ 1669 1670 # added 1671 added = False 1672 dropped = False 1673 1674 # Check if the column already exists in the table 1675 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1676 columns = self.get_query_to_df(query).columns.tolist() 1677 if column_name.upper() in [c.upper() for c in columns]: 1678 log.debug( 1679 f"The {column_name} column already exists in the {table_name} table" 1680 ) 1681 if drop: 1682 self.drop_column(table_name=table_name, column_name=column_name) 1683 dropped = True 1684 else: 1685 return None 1686 else: 1687 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1688 1689 # Add column in table 1690 add_column_query = ( 1691 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1692 ) 1693 if default_value is not None: 1694 add_column_query += f" DEFAULT {default_value}" 1695 self.execute_query(add_column_query) 1696 added = not dropped 1697 log.debug( 1698 f"The {column_name} column was successfully added to the {table_name} table" 1699 ) 1700 1701 if added: 1702 added_column = { 1703 "table_name": table_name, 1704 "column_name": column_name, 1705 "column_type": column_type, 1706 "default_value": default_value, 1707 } 1708 else: 1709 added_column = None 1710 1711 return added_column 1712 1713 def drop_column( 1714 self, column: dict = None, table_name: str = None, column_name: str = None 1715 ) -> bool: 1716 """ 1717 The 
`drop_column` function drops a specified column from a given table in a database and returns 1718 True if the column was successfully dropped, and False if the column does not exist in the 1719 table. 1720 1721 :param column: The `column` parameter is a dictionary that contains information about the column 1722 you want to drop. It has two keys: 1723 :type column: dict 1724 :param table_name: The `table_name` parameter is the name of the table from which you want to 1725 drop a column 1726 :type table_name: str 1727 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1728 from the table 1729 :type column_name: str 1730 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1731 and False if the column does not exist in the table. 1732 """ 1733 1734 # Find column infos 1735 if column: 1736 if isinstance(column, dict): 1737 table_name = column.get("table_name", None) 1738 column_name = column.get("column_name", None) 1739 elif isinstance(column, str): 1740 table_name = self.get_table_variants() 1741 column_name = column 1742 else: 1743 table_name = None 1744 column_name = None 1745 1746 if not table_name and not column_name: 1747 return False 1748 1749 # Removed 1750 removed = False 1751 1752 # Check if the column already exists in the table 1753 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1754 columns = self.get_query_to_df(query).columns.tolist() 1755 if column_name in columns: 1756 log.debug(f"The {column_name} column exists in the {table_name} table") 1757 else: 1758 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1759 return False 1760 1761 # Add column in table # ALTER TABLE integers DROP k 1762 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1763 self.execute_query(add_column_query) 1764 removed = True 1765 log.debug( 1766 f"The {column_name} column was successfully dropped to the {table_name} table" 1767 ) 1768 
1769 return removed 1770 1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. If you provide 1809 a value for the `table` parameter, the function will use that table name. If the `table` 1810 parameter is 1811 :type table: str 1812 :return: The `explode_infos` function returns a list of added columns. 1813 """ 1814 1815 # drop indexes 1816 self.drop_indexes() 1817 1818 # connexion format 1819 connexion_format = self.get_connexion_format() 1820 1821 # Access 1822 access = self.get_config().get("access", None) 1823 1824 # Added columns 1825 added_columns = [] 1826 1827 if access not in ["RO"]: 1828 1829 # prefix 1830 if prefix in [None, True] or not isinstance(prefix, str): 1831 if self.get_explode_infos_prefix() not in [None, True]: 1832 prefix = self.get_explode_infos_prefix() 1833 else: 1834 prefix = "INFO/" 1835 1836 # table variants 1837 if table is not None: 1838 table_variants = table 1839 else: 1840 table_variants = self.get_table_variants(clause="select") 1841 1842 # extra infos 1843 try: 1844 extra_infos = self.get_extra_infos() 1845 except: 1846 extra_infos = [] 1847 1848 # Header infos 1849 header_infos = self.get_header().infos 1850 1851 log.debug( 1852 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1853 ) 1854 1855 sql_info_alter_table_array = [] 1856 1857 # Info fields to check 1858 fields_list = list(header_infos) 1859 if fields: 1860 fields_list += fields 1861 fields_list = set(fields_list) 1862 1863 # If no fields 1864 if not fields: 1865 fields = [] 1866 1867 # Translate fields if patterns 1868 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1869 1870 for info in fields: 1871 1872 info_id_sql = prefix + info 1873 1874 if ( 1875 info 
in fields_list 1876 or prefix + info in fields_list 1877 or info in extra_infos 1878 ): 1879 1880 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1881 1882 if info in header_infos: 1883 info_type = header_infos[info].type 1884 info_num = header_infos[info].num 1885 else: 1886 info_type = "String" 1887 info_num = 0 1888 1889 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1890 if info_num != 1: 1891 type_sql = "VARCHAR" 1892 1893 # Add field 1894 added_column = self.add_column( 1895 table_name=table_variants, 1896 column_name=info_id_sql, 1897 column_type=type_sql, 1898 default_value="null", 1899 drop=force, 1900 ) 1901 1902 if added_column: 1903 added_columns.append(added_column) 1904 1905 if added_column or force: 1906 1907 # add field to index 1908 self.index_additionnal_fields.append(info_id_sql) 1909 1910 # Update field array 1911 if connexion_format in ["duckdb"]: 1912 update_info_field = f""" 1913 "{info_id_sql}" = 1914 CASE 1915 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1916 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1917 END 1918 """ 1919 elif connexion_format in ["sqlite"]: 1920 update_info_field = f""" 1921 "{info_id_sql}" = 1922 CASE 1923 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1924 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1925 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1926 END 1927 """ 1928 1929 sql_info_alter_table_array.append(update_info_field) 1930 1931 if sql_info_alter_table_array: 1932 1933 # By chromosomes 1934 try: 1935 chromosomes_list = list( 1936 self.get_query_to_df( 1937 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1938 )["#CHROM"] 1939 ) 1940 except: 1941 chromosomes_list = [None] 1942 1943 for chrom in chromosomes_list: 1944 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1945 1946 # Where clause 1947 where_clause = "" 1948 if chrom and len(chromosomes_list) > 1: 1949 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1950 1951 # Update table 1952 if proccess_all_fields_together: 1953 sql_info_alter_table_array_join = ", ".join( 1954 sql_info_alter_table_array 1955 ) 1956 if sql_info_alter_table_array_join: 1957 sql_info_alter_table = f""" 1958 UPDATE {table_variants} 1959 SET {sql_info_alter_table_array_join} 1960 {where_clause} 1961 """ 1962 log.debug( 1963 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1964 ) 1965 # log.debug(sql_info_alter_table) 1966 self.conn.execute(sql_info_alter_table) 1967 else: 1968 sql_info_alter_num = 0 1969 for sql_info_alter in sql_info_alter_table_array: 1970 sql_info_alter_num += 1 1971 sql_info_alter_table = f""" 1972 UPDATE {table_variants} 1973 SET {sql_info_alter} 1974 {where_clause} 1975 """ 1976 log.debug( 1977 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1978 ) 1979 # log.debug(sql_info_alter_table) 1980 self.conn.execute(sql_info_alter_table) 1981 1982 # create indexes 1983 if create_index: 1984 self.create_indexes() 1985 1986 return added_columns 1987 1988 def create_indexes(self) -> None: 1989 """ 1990 Create indexes on the table after insertion 1991 """ 1992 1993 # Access 1994 access = self.get_config().get("access", None) 1995 1996 # get table variants 1997 table_variants = self.get_table_variants("FROM") 1998 1999 if self.get_indexing() and access not in ["RO"]: 2000 # Create index 2001 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2002 self.conn.execute(sql_create_table_index) 2003 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2004 self.conn.execute(sql_create_table_index) 2005 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2006 self.conn.execute(sql_create_table_index) 2007 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2008 self.conn.execute(sql_create_table_index) 2009 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2010 self.conn.execute(sql_create_table_index) 2011 for field in self.index_additionnal_fields: 2012 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2013 self.conn.execute(sql_create_table_index) 2014 2015 def drop_indexes(self) -> None: 2016 """ 2017 Create indexes on the table after insertion 2018 """ 2019 2020 # Access 2021 access = self.get_config().get("access", None) 2022 2023 # get table variants 2024 table_variants = self.get_table_variants("FROM") 2025 2026 # Get database format 2027 connexion_format = 
self.get_connexion_format() 2028 2029 if access not in ["RO"]: 2030 if connexion_format in ["duckdb"]: 2031 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2032 elif connexion_format in ["sqlite"]: 2033 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2034 2035 list_indexes = self.conn.execute(sql_list_indexes) 2036 index_names = [row[0] for row in list_indexes.fetchall()] 2037 for index in index_names: 2038 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2039 self.conn.execute(sql_drop_table_index) 2040 2041 def read_vcf_header(self, f) -> list: 2042 """ 2043 It reads the header of a VCF file and returns a list of the header lines 2044 2045 :param f: the file object 2046 :return: The header lines of the VCF file. 2047 """ 2048 2049 header_list = [] 2050 for line in f: 2051 header_list.append(line) 2052 if line.startswith("#CHROM"): 2053 break 2054 return header_list 2055 2056 def read_vcf_header_file(self, file: str = None) -> list: 2057 """ 2058 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2059 uncompressed files. 2060 2061 :param file: The `file` parameter is a string that represents the path to the VCF header file 2062 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2063 default to `None` 2064 :type file: str 2065 :return: The function `read_vcf_header_file` returns a list. 2066 """ 2067 2068 if self.get_input_compressed(input_file=file): 2069 with bgzf.open(file, "rt") as f: 2070 return self.read_vcf_header(f=f) 2071 else: 2072 with open(file, "rt") as f: 2073 return self.read_vcf_header(f=f) 2074 2075 def execute_query(self, query: str): 2076 """ 2077 It takes a query as an argument, executes it, and returns the results 2078 2079 :param query: The query to be executed 2080 :return: The result of the query is being returned. 
2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None 2086 2087 def export_output( 2088 self, 2089 output_file: str | None = None, 2090 output_header: str | None = None, 2091 export_header: bool = True, 2092 query: str | None = None, 2093 parquet_partitions: list | None = None, 2094 chunk_size: int | None = None, 2095 threads: int | None = None, 2096 sort: bool = False, 2097 index: bool = False, 2098 order_by: str | None = None, 2099 fields_to_rename: dict | None = None 2100 ) -> bool: 2101 """ 2102 The `export_output` function exports data from a VCF file to various formats, including VCF, 2103 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2104 partitioning. 2105 2106 :param output_file: The `output_file` parameter is a string that specifies the name of the 2107 output file where the exported data will be saved 2108 :type output_file: str | None 2109 :param output_header: The `output_header` parameter is a string that specifies the name of the 2110 file where the header of the VCF file will be exported. If this parameter is not provided, the 2111 header will be exported to a file with the same name as the `output_file` parameter, but with 2112 the extension " 2113 :type output_header: str | None 2114 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2115 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2116 True, the header will be exported to a file. If `export_header` is False, the header will not 2117 be, defaults to True 2118 :type export_header: bool (optional) 2119 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2120 that can be used to filter and select specific data from the VCF file before exporting it. If 2121 provided, only the data that matches the query will be exported. 
This allows you to customize 2122 the exported data based on 2123 :type query: str | None 2124 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2125 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2126 organize data in a hierarchical directory structure based on the values of one or more columns. 2127 This can improve query performance when working with large datasets 2128 :type parquet_partitions: list | None 2129 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2130 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2131 multiple files. It helps in optimizing the export process by breaking down the data into 2132 manageable chunks for processing and storage 2133 :type chunk_size: int | None 2134 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2135 threads to be used during the export process. It determines the level of parallelism and can 2136 improve the performance of the export operation. If this parameter is not provided, the function 2137 will use the default number of threads 2138 :type threads: int | None 2139 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2140 determines whether the output file should be sorted based on genomic coordinates of the 2141 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2142 `False`,, defaults to False 2143 :type sort: bool (optional) 2144 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2145 determines whether an index should be created on the output file. If `index` is set to `True`, 2146 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2147 :type index: bool (optional) 2148 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2149 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2150 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2151 output file should be 2152 :type order_by: str | None 2153 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2154 mapping of field names to be renamed during the export process. This parameter allows you to 2155 customize the output field names before exporting the data. Each key-value pair in the 2156 dictionary represents the original field name as the key and the new field name 2157 :type fields_to_rename: dict | None 2158 :return: The `export_output` function returns a boolean value. It checks if the output file 2159 exists and returns True if it does, or None if it doesn't. 
2160 """ 2161 2162 # Log 2163 log.info("Exporting...") 2164 2165 # Full path 2166 output_file = full_path(output_file) 2167 output_header = full_path(output_header) 2168 2169 # Config 2170 config = self.get_config() 2171 2172 # Param 2173 param = self.get_param() 2174 2175 # Tmp files to remove 2176 tmp_to_remove = [] 2177 2178 # If no output, get it 2179 if not output_file: 2180 output_file = self.get_output() 2181 2182 # If not threads 2183 if not threads: 2184 threads = self.get_threads() 2185 2186 # Rename fields 2187 if not fields_to_rename: 2188 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2189 self.rename_info_fields(fields_to_rename=fields_to_rename) 2190 2191 # Auto header name with extension 2192 if export_header or output_header: 2193 if not output_header: 2194 output_header = f"{output_file}.hdr" 2195 # Export header 2196 self.export_header(output_file=output_file) 2197 2198 # Switch off export header if VCF output 2199 output_file_type = get_file_format(output_file) 2200 if output_file_type in ["vcf"]: 2201 export_header = False 2202 tmp_to_remove.append(output_header) 2203 2204 # Chunk size 2205 if not chunk_size: 2206 chunk_size = config.get("chunk_size", None) 2207 2208 # Parquet partition 2209 if not parquet_partitions: 2210 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2211 if parquet_partitions and isinstance(parquet_partitions, str): 2212 parquet_partitions = parquet_partitions.split(",") 2213 2214 # Order by 2215 if not order_by: 2216 order_by = param.get("export", {}).get("order_by", "") 2217 2218 # Header in output 2219 header_in_output = param.get("export", {}).get("include_header", False) 2220 2221 # Database 2222 database_source = self.get_connexion() 2223 2224 # Connexion format 2225 connexion_format = self.get_connexion_format() 2226 2227 # Explode infos 2228 if self.get_explode_infos(): 2229 self.explode_infos( 2230 prefix=self.get_explode_infos_prefix(), 2231 
fields=self.get_explode_infos_fields(), 2232 force=False, 2233 ) 2234 2235 # if connexion_format in ["sqlite"] or query: 2236 if connexion_format in ["sqlite"]: 2237 2238 # Export in Parquet 2239 random_tmp = "".join( 2240 random.choice(string.ascii_lowercase) for i in range(10) 2241 ) 2242 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2243 tmp_to_remove.append(database_source) 2244 2245 # Table Variants 2246 table_variants = self.get_table_variants() 2247 2248 # Create export query 2249 sql_query_export_subquery = f""" 2250 SELECT * FROM {table_variants} 2251 """ 2252 2253 # Write source file 2254 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2255 2256 # Create database 2257 database = Database( 2258 database=database_source, 2259 table="variants", 2260 header_file=output_header, 2261 conn_config=self.get_connexion_config(), 2262 ) 2263 2264 # Existing colomns header 2265 existing_columns_header = database.get_header_columns_from_database(query=query) 2266 2267 # Sample list 2268 if output_file_type in ["vcf"]: 2269 get_samples = self.get_samples() 2270 get_samples_check = self.get_samples_check() 2271 samples_force = get_samples is not None 2272 sample_list = self.get_header_sample_list( 2273 check=get_samples_check, 2274 samples=get_samples, 2275 samples_force=samples_force, 2276 ) 2277 else: 2278 sample_list = None 2279 2280 # Export file 2281 database.export( 2282 output_database=output_file, 2283 output_header=output_header, 2284 existing_columns_header=existing_columns_header, 2285 parquet_partitions=parquet_partitions, 2286 chunk_size=chunk_size, 2287 threads=threads, 2288 sort=sort, 2289 index=index, 2290 header_in_output=header_in_output, 2291 order_by=order_by, 2292 query=query, 2293 export_header=export_header, 2294 sample_list=sample_list, 2295 ) 2296 2297 # Remove 2298 remove_if_exists(tmp_to_remove) 2299 2300 return (os.path.exists(output_file) or None) and ( 2301 os.path.exists(output_file) 
or None 2302 ) 2303 2304 def get_extra_infos(self, table: str = None) -> list: 2305 """ 2306 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2307 in the header. 2308 2309 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2310 name of the table from which you want to retrieve the extra columns that are not present in the 2311 header. If the `table` parameter is not provided when calling the function, it will default to 2312 using the variants 2313 :type table: str 2314 :return: A list of columns that are in the specified table but not in the header of the table. 2315 """ 2316 2317 header_columns = [] 2318 2319 if not table: 2320 table = self.get_table_variants(clause="from") 2321 header_columns = self.get_header_columns() 2322 2323 # Check all columns in the database 2324 query = f""" SELECT * FROM {table} LIMIT 1 """ 2325 log.debug(f"query {query}") 2326 table_columns = self.get_query_to_df(query).columns.tolist() 2327 extra_columns = [] 2328 2329 # Construct extra infos (not in header) 2330 for column in table_columns: 2331 if column not in header_columns: 2332 extra_columns.append(column) 2333 2334 return extra_columns 2335 2336 def get_extra_infos_sql(self, table: str = None) -> str: 2337 """ 2338 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2339 by double quotes 2340 2341 :param table: The name of the table to get the extra infos from. 
If None, the default table is 2342 used 2343 :type table: str 2344 :return: A string of the extra infos 2345 """ 2346 2347 return ", ".join( 2348 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2349 ) 2350 2351 def export_header( 2352 self, 2353 header_name: str = None, 2354 output_file: str = None, 2355 output_file_ext: str = ".hdr", 2356 clean_header: bool = True, 2357 remove_chrom_line: bool = False, 2358 ) -> str: 2359 """ 2360 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2361 specified options, and writes it to a new file. 2362 2363 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2364 this parameter is not specified, the header will be written to the output file 2365 :type header_name: str 2366 :param output_file: The `output_file` parameter in the `export_header` function is used to 2367 specify the name of the output file where the header will be written. If this parameter is not 2368 provided, the header will be written to a temporary file 2369 :type output_file: str 2370 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2371 string that represents the extension of the output header file. By default, it is set to ".hdr" 2372 if not specified by the user. This extension will be appended to the `output_file` name to 2373 create the final, defaults to .hdr 2374 :type output_file_ext: str (optional) 2375 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2376 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2377 `True`, the function will clean the header by modifying certain lines based on a specific 2378 pattern. 
If `clean_header`, defaults to True 2379 :type clean_header: bool (optional) 2380 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2381 boolean flag that determines whether the #CHROM line should be removed from the header before 2382 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2383 defaults to False 2384 :type remove_chrom_line: bool (optional) 2385 :return: The function `export_header` returns the name of the temporary header file that is 2386 created. 2387 """ 2388 2389 if not header_name and not output_file: 2390 output_file = self.get_output() 2391 2392 if self.get_header(): 2393 2394 # Get header object 2395 header_obj = self.get_header() 2396 2397 # Create database 2398 db_for_header = Database(database=self.get_input()) 2399 2400 # Get real columns in the file 2401 db_header_columns = db_for_header.get_columns() 2402 2403 with tempfile.TemporaryDirectory() as tmpdir: 2404 2405 # Write header file 2406 header_file_tmp = os.path.join(tmpdir, "header") 2407 f = open(header_file_tmp, "w") 2408 vcf.Writer(f, header_obj) 2409 f.close() 2410 2411 # Replace #CHROM line with rel columns 2412 header_list = db_for_header.read_header_file( 2413 header_file=header_file_tmp 2414 ) 2415 header_list[-1] = "\t".join(db_header_columns) 2416 2417 # Remove CHROM line 2418 if remove_chrom_line: 2419 header_list.pop() 2420 2421 # Clean header 2422 if clean_header: 2423 header_list_clean = [] 2424 for head in header_list: 2425 # Clean head for malformed header 2426 head_clean = head 2427 head_clean = re.subn( 2428 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2429 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2430 head_clean, 2431 2, 2432 )[0] 2433 # Write header 2434 header_list_clean.append(head_clean) 2435 header_list = header_list_clean 2436 2437 tmp_header_name = output_file + output_file_ext 2438 2439 f = open(tmp_header_name, "w") 2440 for line in header_list: 2441 f.write(line) 
2442 f.close() 2443 2444 return tmp_header_name 2445 2446 def export_variant_vcf( 2447 self, 2448 vcf_file, 2449 remove_info: bool = False, 2450 add_samples: bool = True, 2451 list_samples: list = [], 2452 where_clause: str = "", 2453 index: bool = False, 2454 threads: int | None = None, 2455 ) -> bool | None: 2456 """ 2457 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2458 remove INFO field, add samples, and control compression and indexing. 2459 2460 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2461 written to. It is the output file that will contain the filtered VCF data based on the specified 2462 parameters 2463 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2464 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2465 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2466 in, defaults to False 2467 :type remove_info: bool (optional) 2468 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2469 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2470 If set to False, the samples will be removed. The default value is True, defaults to True 2471 :type add_samples: bool (optional) 2472 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2473 in the output VCF file. By default, all samples will be included. If you provide a list of 2474 samples, only those samples will be included in the output file 2475 :type list_samples: list 2476 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2477 determines whether or not to create an index for the output VCF file. If `index` is set to 2478 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2479 :type index: bool (optional) 2480 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2481 number of threads to use for exporting the VCF file. It determines how many parallel threads 2482 will be used during the export process. More threads can potentially speed up the export process 2483 by utilizing multiple cores of the processor. If 2484 :type threads: int | None 2485 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2486 method with various parameters including the output file, query, threads, sort flag, and index 2487 flag. The `export_output` method is responsible for exporting the VCF data based on the 2488 specified parameters and configurations provided in the `export_variant_vcf` function. 2489 """ 2490 2491 # Config 2492 config = self.get_config() 2493 2494 # Extract VCF 2495 log.debug("Export VCF...") 2496 2497 # Table variants 2498 table_variants = self.get_table_variants() 2499 2500 # Threads 2501 if not threads: 2502 threads = self.get_threads() 2503 2504 # Info fields 2505 if remove_info: 2506 if not isinstance(remove_info, str): 2507 remove_info = "." 
2508 info_field = f"""'{remove_info}' as INFO""" 2509 else: 2510 info_field = "INFO" 2511 2512 # Samples fields 2513 if add_samples: 2514 if not list_samples: 2515 list_samples = self.get_header_sample_list() 2516 if list_samples: 2517 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2518 else: 2519 samples_fields = "" 2520 log.debug(f"samples_fields: {samples_fields}") 2521 else: 2522 samples_fields = "" 2523 2524 # Where clause 2525 if where_clause is None: 2526 where_clause = "" 2527 2528 # Variants 2529 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2530 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2531 log.debug(f"sql_query_select={sql_query_select}") 2532 2533 return self.export_output( 2534 output_file=vcf_file, 2535 output_header=None, 2536 export_header=True, 2537 query=sql_query_select, 2538 parquet_partitions=None, 2539 chunk_size=config.get("chunk_size", None), 2540 threads=threads, 2541 sort=True, 2542 index=index, 2543 order_by=None, 2544 ) 2545 2546 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2547 """ 2548 It takes a list of commands and runs them in parallel using the number of threads specified 2549 2550 :param commands: A list of commands to run 2551 :param threads: The number of threads to use, defaults to 1 (optional) 2552 """ 2553 2554 run_parallel_commands(commands, threads) 2555 2556 def get_threads(self, default: int = 1) -> int: 2557 """ 2558 This function returns the number of threads to use for a job, with a default value of 1 if not 2559 specified. 2560 2561 :param default: The `default` parameter in the `get_threads` method is used to specify the 2562 default number of threads to use if no specific value is provided. 
If no value is provided for
        the `threads` parameter in the configuration or input parameters, the `default` value will be
        used, defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads (param takes precedence over config)
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads: a non-positive value means "use all available cores"
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        This function retrieves the memory value from parameters or configuration with a default value
        if not found.

        :param default: fallback value used when `memory` is not found in the `param`
        dictionary nor in the `config` dictionary
        :type default: str
        :return: the memory setting as a string (e.g. "8G" — format not validated here), or
        `default` when not configured.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Check memory
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Dispatch the update to the backend-specific implementation (duckdb or sqlite).

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load VCF into a DataFrame (referenced by name in the SQL below through
        # DuckDB's replacement scan), skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO, inserting ';' only
        # when both sides are non-empty ('' or '.' count as empty)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                            SELECT
                                concat(
                                    CASE
                                        WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END
                                    ,
                                    CASE
                                        WHEN table_parquet.INFO NOT IN ('','.')
                                        THEN table_parquet.INFO
                                        ELSE ''
                                    END
                                )
                            FROM vcf_df as table_parquet
                            WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" =
table_variants.\"POS\" 2680 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2681 AND table_parquet.\"REF\" = table_variants.\"REF\" 2682 AND table_parquet.INFO NOT IN ('','.') 2683 ) 2684 ) 2685 ; 2686 """ 2687 self.conn.execute(sql_query_update) 2688 2689 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2690 """ 2691 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2692 table, then updates the INFO column of the variants table with the INFO column of the temporary 2693 table 2694 2695 :param vcf_file: The path to the VCF file you want to update the database with 2696 """ 2697 2698 # Create a temporary table for the VCF 2699 table_vcf = "tmp_vcf" 2700 sql_create = ( 2701 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2702 ) 2703 self.conn.execute(sql_create) 2704 2705 # Loading VCF into temporaire table 2706 vcf_df = pd.read_csv( 2707 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2708 ) 2709 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2710 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2711 2712 # Update table 'variants' with VCF data 2713 # warning: CONCAT as || operator 2714 sql_query_update = f""" 2715 UPDATE variants as table_variants 2716 SET INFO = CASE 2717 WHEN INFO NOT IN ('', '.') 2718 THEN INFO 2719 ELSE '' 2720 END || 2721 ( 2722 SELECT 2723 CASE 2724 WHEN table_variants.INFO NOT IN ('','.') 2725 AND table_vcf.INFO NOT IN ('','.') 2726 THEN ';' 2727 ELSE '' 2728 END || 2729 CASE 2730 WHEN table_vcf.INFO NOT IN ('','.') 2731 THEN table_vcf.INFO 2732 ELSE '' 2733 END 2734 FROM {table_vcf} as table_vcf 2735 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2736 AND table_vcf.\"POS\" = table_variants.\"POS\" 2737 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2738 AND table_vcf.\"REF\" = table_variants.\"REF\" 2739 ) 2740 """ 2741 self.conn.execute(sql_query_update) 2742 2743 # Drop temporary table 
2744 sql_drop = f"DROP TABLE {table_vcf}" 2745 self.conn.execute(sql_drop) 2746 2747 def drop_variants_table(self) -> None: 2748 """ 2749 > This function drops the variants table 2750 """ 2751 2752 table_variants = self.get_table_variants() 2753 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2754 self.conn.execute(sql_table_variants) 2755 2756 def set_variant_id( 2757 self, variant_id_column: str = "variant_id", force: bool = None 2758 ) -> str: 2759 """ 2760 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2761 `#CHROM`, `POS`, `REF`, and `ALT` columns 2762 2763 :param variant_id_column: The name of the column to be created in the variants table, defaults 2764 to variant_id 2765 :type variant_id_column: str (optional) 2766 :param force: If True, the variant_id column will be created even if it already exists 2767 :type force: bool 2768 :return: The name of the column that contains the variant_id 2769 """ 2770 2771 # Assembly 2772 assembly = self.get_param().get( 2773 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2774 ) 2775 2776 # INFO/Tag prefix 2777 prefix = self.get_explode_infos_prefix() 2778 2779 # Explode INFO/SVTYPE 2780 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2781 2782 # variants table 2783 table_variants = self.get_table_variants() 2784 2785 # variant_id column 2786 if not variant_id_column: 2787 variant_id_column = "variant_id" 2788 2789 # Creta variant_id column 2790 if "variant_id" not in self.get_extra_infos() or force: 2791 2792 # Create column 2793 self.add_column( 2794 table_name=table_variants, 2795 column_name=variant_id_column, 2796 column_type="UBIGINT", 2797 default_value="0", 2798 ) 2799 2800 # Update column 2801 self.conn.execute( 2802 f""" 2803 UPDATE {table_variants} 2804 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2805 """ 2806 ) 2807 2808 # Remove added columns 2809 for 
added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        This function returns the variant_id column name (creating/populating the column
        through `set_variant_id` as a side effect).

        :param variant_id_column: The name of the column that contains the variant IDs,
        defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: forwarded to `set_variant_id`; if True, the column is (re)populated
        even when it already exists
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        The function `scan_databases` scans for available databases based on specified formats and
        releases.

        NOTE(review): both defaults are mutable lists shared across calls — safe only while
        callers never mutate them.

        :param database_formats: formats of the databases to scan for, defaults to ["parquet"]
        :type database_formats: list
        :param database_releases: releases of the databases to scan for, defaults to ["current"]
        :type database_releases: list
        :return: a dictionary describing the databases matching the requested formats and
        releases for the configured assembly.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then built-in default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
2889 """ 2890 2891 # Config 2892 config = self.get_config() 2893 2894 # Param 2895 param = self.get_param() 2896 2897 # Param - Assembly 2898 assembly = param.get("assembly", config.get("assembly", None)) 2899 if not assembly: 2900 assembly = DEFAULT_ASSEMBLY 2901 log.warning(f"Default assembly '{assembly}'") 2902 2903 # annotations databases folders 2904 annotations_databases = set( 2905 config.get("folders", {}) 2906 .get("databases", {}) 2907 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2908 + config.get("folders", {}) 2909 .get("databases", {}) 2910 .get("parquet", ["~/howard/databases/parquet/current"]) 2911 + config.get("folders", {}) 2912 .get("databases", {}) 2913 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2914 ) 2915 2916 # Get param annotations 2917 if param.get("annotations", None) and isinstance( 2918 param.get("annotations", None), str 2919 ): 2920 log.debug(param.get("annotations", None)) 2921 param_annotation_list = param.get("annotations").split(",") 2922 else: 2923 param_annotation_list = [] 2924 2925 # Each tools param 2926 if param.get("annotation_parquet", None) != None: 2927 log.debug( 2928 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2929 ) 2930 if isinstance(param.get("annotation_parquet", None), list): 2931 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2932 else: 2933 param_annotation_list.append(param.get("annotation_parquet")) 2934 if param.get("annotation_snpsift", None) != None: 2935 if isinstance(param.get("annotation_snpsift", None), list): 2936 param_annotation_list.append( 2937 "snpsift:" 2938 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2939 ) 2940 else: 2941 param_annotation_list.append( 2942 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2943 ) 2944 if param.get("annotation_snpeff", None) != None: 2945 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2946 if param.get("annotation_bcftools", 
None) != None: 2947 if isinstance(param.get("annotation_bcftools", None), list): 2948 param_annotation_list.append( 2949 "bcftools:" 2950 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2951 ) 2952 else: 2953 param_annotation_list.append( 2954 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2955 ) 2956 if param.get("annotation_annovar", None) != None: 2957 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2958 if param.get("annotation_exomiser", None) != None: 2959 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2960 if param.get("annotation_splice", None) != None: 2961 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2962 2963 # Merge param annotations list 2964 param["annotations"] = ",".join(param_annotation_list) 2965 2966 # debug 2967 log.debug(f"param_annotations={param['annotations']}") 2968 2969 if param.get("annotations"): 2970 2971 # Log 2972 # log.info("Annotations - Check annotation parameters") 2973 2974 if not "annotation" in param: 2975 param["annotation"] = {} 2976 2977 # List of annotations parameters 2978 annotations_list_input = {} 2979 if isinstance(param.get("annotations", None), str): 2980 annotation_file_list = [ 2981 value for value in param.get("annotations", "").split(",") 2982 ] 2983 for annotation_file in annotation_file_list: 2984 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2985 else: 2986 annotations_list_input = param.get("annotations", {}) 2987 2988 log.info(f"Quick Annotations:") 2989 for annotation_key in list(annotations_list_input.keys()): 2990 log.info(f" {annotation_key}") 2991 2992 # List of annotations and associated fields 2993 annotations_list = {} 2994 2995 for annotation_file in annotations_list_input: 2996 2997 # Explode annotations if ALL 2998 if ( 2999 annotation_file.upper() == "ALL" 3000 or annotation_file.upper().startswith("ALL:") 3001 ): 3002 3003 # check ALL parameters (formats, 
releases) 3004 annotation_file_split = annotation_file.split(":") 3005 database_formats = "parquet" 3006 database_releases = "current" 3007 for annotation_file_option in annotation_file_split[1:]: 3008 database_all_options_split = annotation_file_option.split("=") 3009 if database_all_options_split[0] == "format": 3010 database_formats = database_all_options_split[1].split("+") 3011 if database_all_options_split[0] == "release": 3012 database_releases = database_all_options_split[1].split("+") 3013 3014 # Scan for availabled databases 3015 databases_infos_dict = self.scan_databases( 3016 database_formats=database_formats, 3017 database_releases=database_releases, 3018 ) 3019 3020 # Add found databases in annotation parameters 3021 for database_infos in databases_infos_dict.keys(): 3022 annotations_list[database_infos] = {"INFO": None} 3023 3024 else: 3025 annotations_list[annotation_file] = annotations_list_input[ 3026 annotation_file 3027 ] 3028 3029 # Check each databases 3030 if len(annotations_list): 3031 3032 log.info( 3033 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3034 ) 3035 3036 for annotation_file in annotations_list: 3037 3038 # Init 3039 annotations = annotations_list.get(annotation_file, None) 3040 3041 # Annotation snpEff 3042 if annotation_file.startswith("snpeff"): 3043 3044 log.debug(f"Quick Annotation snpEff") 3045 3046 if "snpeff" not in param["annotation"]: 3047 param["annotation"]["snpeff"] = {} 3048 3049 if "options" not in param["annotation"]["snpeff"]: 3050 param["annotation"]["snpeff"]["options"] = "" 3051 3052 # snpEff options in annotations 3053 param["annotation"]["snpeff"]["options"] = "".join( 3054 annotation_file.split(":")[1:] 3055 ) 3056 3057 # Annotation Annovar 3058 elif annotation_file.startswith("annovar"): 3059 3060 log.debug(f"Quick Annotation Annovar") 3061 3062 if "annovar" not in param["annotation"]: 3063 param["annotation"]["annovar"] = {} 3064 3065 if "annotations" not in param["annotation"]["annovar"]: 3066 param["annotation"]["annovar"]["annotations"] = {} 3067 3068 # Options 3069 annotation_file_split = annotation_file.split(":") 3070 for annotation_file_annotation in annotation_file_split[1:]: 3071 if annotation_file_annotation: 3072 param["annotation"]["annovar"]["annotations"][ 3073 annotation_file_annotation 3074 ] = annotations 3075 3076 # Annotation Exomiser 3077 elif annotation_file.startswith("exomiser"): 3078 3079 log.debug(f"Quick Annotation Exomiser") 3080 3081 param["annotation"]["exomiser"] = params_string_to_dict( 3082 annotation_file 3083 ) 3084 3085 # Annotation Splice 3086 elif annotation_file.startswith("splice"): 3087 3088 log.debug(f"Quick Annotation Splice") 3089 3090 param["annotation"]["splice"] = params_string_to_dict( 3091 annotation_file 3092 ) 3093 3094 # Annotation Parquet or BCFTOOLS 3095 else: 3096 3097 # Tools detection 3098 if annotation_file.startswith("bcftools:"): 3099 annotation_tool_initial = "bcftools" 3100 annotation_file = ":".join(annotation_file.split(":")[1:]) 3101 elif annotation_file.startswith("snpsift:"): 3102 annotation_tool_initial = 
"snpsift" 3103 annotation_file = ":".join(annotation_file.split(":")[1:]) 3104 elif annotation_file.startswith("bigwig:"): 3105 annotation_tool_initial = "bigwig" 3106 annotation_file = ":".join(annotation_file.split(":")[1:]) 3107 else: 3108 annotation_tool_initial = None 3109 3110 # list of files 3111 annotation_file_list = annotation_file.replace("+", ":").split( 3112 ":" 3113 ) 3114 3115 for annotation_file in annotation_file_list: 3116 3117 if annotation_file: 3118 3119 # Annotation tool initial 3120 annotation_tool = annotation_tool_initial 3121 3122 # Find file 3123 annotation_file_found = None 3124 3125 if os.path.exists(annotation_file): 3126 annotation_file_found = annotation_file 3127 elif os.path.exists(full_path(annotation_file)): 3128 annotation_file_found = full_path(annotation_file) 3129 else: 3130 # Find within assembly folders 3131 for annotations_database in annotations_databases: 3132 found_files = find_all( 3133 annotation_file, 3134 os.path.join( 3135 annotations_database, assembly 3136 ), 3137 ) 3138 if len(found_files) > 0: 3139 annotation_file_found = found_files[0] 3140 break 3141 if not annotation_file_found and not assembly: 3142 # Find within folders 3143 for ( 3144 annotations_database 3145 ) in annotations_databases: 3146 found_files = find_all( 3147 annotation_file, annotations_database 3148 ) 3149 if len(found_files) > 0: 3150 annotation_file_found = found_files[0] 3151 break 3152 log.debug( 3153 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3154 ) 3155 3156 # Full path 3157 annotation_file_found = full_path(annotation_file_found) 3158 3159 if annotation_file_found: 3160 3161 database = Database(database=annotation_file_found) 3162 quick_annotation_format = database.get_format() 3163 quick_annotation_is_compressed = ( 3164 database.is_compressed() 3165 ) 3166 quick_annotation_is_indexed = os.path.exists( 3167 f"{annotation_file_found}.tbi" 3168 ) 3169 bcftools_preference = False 3170 3171 # Check Annotation 
Tool 3172 if not annotation_tool: 3173 if ( 3174 bcftools_preference 3175 and quick_annotation_format 3176 in ["vcf", "bed"] 3177 and quick_annotation_is_compressed 3178 and quick_annotation_is_indexed 3179 ): 3180 annotation_tool = "bcftools" 3181 elif quick_annotation_format in [ 3182 "vcf", 3183 "bed", 3184 "tsv", 3185 "tsv", 3186 "csv", 3187 "json", 3188 "tbl", 3189 "parquet", 3190 "duckdb", 3191 ]: 3192 annotation_tool = "parquet" 3193 elif quick_annotation_format in ["bw"]: 3194 annotation_tool = "bigwig" 3195 else: 3196 log.error( 3197 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3198 ) 3199 raise ValueError( 3200 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3201 ) 3202 3203 log.debug( 3204 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3205 ) 3206 3207 # Annotation Tool dispatch 3208 if annotation_tool: 3209 if annotation_tool not in param["annotation"]: 3210 param["annotation"][annotation_tool] = {} 3211 if ( 3212 "annotations" 3213 not in param["annotation"][annotation_tool] 3214 ): 3215 param["annotation"][annotation_tool][ 3216 "annotations" 3217 ] = {} 3218 param["annotation"][annotation_tool][ 3219 "annotations" 3220 ][annotation_file_found] = annotations 3221 3222 else: 3223 log.warning( 3224 f"Quick Annotation File {annotation_file} does NOT exist" 3225 ) 3226 3227 self.set_param(param) 3228 3229 if param.get("annotation", None): 3230 log.info("Annotations") 3231 if param.get("annotation", {}).get("parquet", None): 3232 log.info("Annotations 'parquet'...") 3233 self.annotation_parquet() 3234 if param.get("annotation", {}).get("bcftools", None): 3235 log.info("Annotations 'bcftools'...") 3236 self.annotation_bcftools() 3237 if param.get("annotation", {}).get("snpsift", None): 3238 log.info("Annotations 'snpsift'...") 3239 self.annotation_snpsift() 3240 if param.get("annotation", {}).get("bigwig", None): 
3241 log.info("Annotations 'bigwig'...") 3242 self.annotation_bigwig() 3243 if param.get("annotation", {}).get("annovar", None): 3244 log.info("Annotations 'annovar'...") 3245 self.annotation_annovar() 3246 if param.get("annotation", {}).get("snpeff", None): 3247 log.info("Annotations 'snpeff'...") 3248 self.annotation_snpeff() 3249 if param.get("annotation", {}).get("exomiser", None) is not None: 3250 log.info("Annotations 'exomiser'...") 3251 self.annotation_exomiser() 3252 if param.get("annotation", {}).get("splice", None) is not None: 3253 log.info("Annotations 'splice' ...") 3254 self.annotation_splice() 3255 3256 # Explode INFOS fields into table fields 3257 if self.get_explode_infos(): 3258 self.explode_infos( 3259 prefix=self.get_explode_infos_prefix(), 3260 fields=self.get_explode_infos_fields(), 3261 force=True, 3262 ) 3263 3264 def annotation_bigwig(self, threads: int = None) -> None: 3265 """ 3266 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3267 3268 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3269 number of threads to be used for parallel processing during the annotation process. 
If the 3270 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3271 threads to use based on the system configuration 3272 :type threads: int 3273 :return: True 3274 """ 3275 3276 # DEBUG 3277 log.debug("Start annotation with bigwig databases") 3278 3279 # # Threads 3280 # if not threads: 3281 # threads = self.get_threads() 3282 # log.debug("Threads: " + str(threads)) 3283 3284 # Config 3285 config = self.get_config() 3286 log.debug("Config: " + str(config)) 3287 3288 # Config - BCFTools databases folders 3289 databases_folders = set( 3290 self.get_config() 3291 .get("folders", {}) 3292 .get("databases", {}) 3293 .get("annotations", ["."]) 3294 + self.get_config() 3295 .get("folders", {}) 3296 .get("databases", {}) 3297 .get("bigwig", ["."]) 3298 ) 3299 log.debug("Databases annotations: " + str(databases_folders)) 3300 3301 # Param 3302 annotations = ( 3303 self.get_param() 3304 .get("annotation", {}) 3305 .get("bigwig", {}) 3306 .get("annotations", None) 3307 ) 3308 log.debug("Annotations: " + str(annotations)) 3309 3310 # Assembly 3311 assembly = self.get_param().get( 3312 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3313 ) 3314 3315 # Data 3316 table_variants = self.get_table_variants() 3317 3318 # Check if not empty 3319 log.debug("Check if not empty") 3320 sql_query_chromosomes = ( 3321 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3322 ) 3323 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3324 if not sql_query_chromosomes_df["count"][0]: 3325 log.info(f"VCF empty") 3326 return 3327 3328 # VCF header 3329 vcf_reader = self.get_header() 3330 log.debug("Initial header: " + str(vcf_reader.infos)) 3331 3332 # Existing annotations 3333 for vcf_annotation in self.get_header().infos: 3334 3335 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3336 log.debug( 3337 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3338 
) 3339 3340 if annotations: 3341 3342 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3343 3344 # Export VCF file 3345 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3346 3347 # annotation_bigwig_config 3348 annotation_bigwig_config_list = [] 3349 3350 for annotation in annotations: 3351 annotation_fields = annotations[annotation] 3352 3353 # Annotation Name 3354 annotation_name = os.path.basename(annotation) 3355 3356 if not annotation_fields: 3357 annotation_fields = {"INFO": None} 3358 3359 log.debug(f"Annotation '{annotation_name}'") 3360 log.debug( 3361 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3362 ) 3363 3364 # Create Database 3365 database = Database( 3366 database=annotation, 3367 databases_folders=databases_folders, 3368 assembly=assembly, 3369 ) 3370 3371 # Find files 3372 db_file = database.get_database() 3373 db_file = full_path(db_file) 3374 db_hdr_file = database.get_header_file() 3375 db_hdr_file = full_path(db_hdr_file) 3376 db_file_type = database.get_format() 3377 3378 # If db_file is http ? 
3379 if database.get_database().startswith("http"): 3380 3381 # Datbase is HTTP URL 3382 db_file_is_http = True 3383 3384 # DB file keep as URL 3385 db_file = database.get_database() 3386 log.warning( 3387 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3388 ) 3389 3390 # Retrieve automatic annotation field name 3391 annotation_field = clean_annotation_field( 3392 os.path.basename(db_file).replace(".bw", "") 3393 ) 3394 log.debug( 3395 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3396 ) 3397 3398 # Create automatic header file 3399 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3400 with open(db_hdr_file, "w") as f: 3401 f.write("##fileformat=VCFv4.2\n") 3402 f.write( 3403 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3404 ) 3405 f.write(f"#CHROM START END {annotation_field}\n") 3406 3407 else: 3408 3409 # Datbase is NOT HTTP URL 3410 db_file_is_http = False 3411 3412 # Check index - try to create if not exists 3413 if ( 3414 db_file is None 3415 or db_hdr_file is None 3416 or (not os.path.exists(db_file) and not db_file_is_http) 3417 or not os.path.exists(db_hdr_file) 3418 or not db_file_type in ["bw"] 3419 ): 3420 # if False: 3421 log.error("Annotation failed: database not valid") 3422 log.error(f"Annotation annotation file: {db_file}") 3423 log.error(f"Annotation annotation file type: {db_file_type}") 3424 log.error(f"Annotation annotation header: {db_hdr_file}") 3425 raise ValueError( 3426 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3427 ) 3428 else: 3429 3430 # Log 3431 log.debug( 3432 f"Annotation '{annotation}' - file: " 3433 + str(db_file) 3434 + " and " 3435 + str(db_hdr_file) 3436 ) 3437 3438 # Load header as VCF object 3439 db_hdr_vcf = Variants(input=db_hdr_file) 3440 db_hdr_vcf_header_infos = 
db_hdr_vcf.get_header().infos 3441 log.debug( 3442 "Annotation database header: " 3443 + str(db_hdr_vcf_header_infos) 3444 ) 3445 3446 # For all fields in database 3447 annotation_fields_full = False 3448 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3449 annotation_fields = { 3450 key: key for key in db_hdr_vcf_header_infos 3451 } 3452 log.debug( 3453 "Annotation database header - All annotations added: " 3454 + str(annotation_fields) 3455 ) 3456 annotation_fields_full = True 3457 3458 # Init 3459 cyvcf2_header_rename_dict = {} 3460 cyvcf2_header_list = [] 3461 cyvcf2_header_indexes = {} 3462 3463 # process annotation fields 3464 for annotation_field in annotation_fields: 3465 3466 # New annotation name 3467 annotation_field_new = annotation_fields[annotation_field] 3468 3469 # Check annotation field and index in header 3470 if ( 3471 annotation_field 3472 in db_hdr_vcf.get_header_columns_as_list() 3473 ): 3474 annotation_field_index = ( 3475 db_hdr_vcf.get_header_columns_as_list().index( 3476 annotation_field 3477 ) 3478 - 3 3479 ) 3480 cyvcf2_header_indexes[annotation_field_new] = ( 3481 annotation_field_index 3482 ) 3483 else: 3484 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Append annotation field in cyvcf2 header list 3489 cyvcf2_header_rename_dict[annotation_field_new] = ( 3490 db_hdr_vcf_header_infos[annotation_field].id 3491 ) 3492 cyvcf2_header_list.append( 3493 { 3494 "ID": annotation_field_new, 3495 "Number": db_hdr_vcf_header_infos[ 3496 annotation_field 3497 ].num, 3498 "Type": db_hdr_vcf_header_infos[ 3499 annotation_field 3500 ].type, 3501 "Description": db_hdr_vcf_header_infos[ 3502 annotation_field 3503 ].desc, 3504 } 3505 ) 3506 3507 # Add header on VCF 3508 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3509 annotation_field_new, 3510 db_hdr_vcf_header_infos[annotation_field].num, 3511 
db_hdr_vcf_header_infos[annotation_field].type, 3512 db_hdr_vcf_header_infos[annotation_field].desc, 3513 "HOWARD BigWig annotation", 3514 "unknown", 3515 self.code_type_map[ 3516 db_hdr_vcf_header_infos[annotation_field].type 3517 ], 3518 ) 3519 3520 # Load bigwig database 3521 bw_db = pyBigWig.open(db_file) 3522 if bw_db.isBigWig(): 3523 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3524 else: 3525 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3526 log.error(msg_err) 3527 raise ValueError(msg_err) 3528 3529 annotation_bigwig_config_list.append( 3530 { 3531 "db_file": db_file, 3532 "bw_db": bw_db, 3533 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3534 "cyvcf2_header_list": cyvcf2_header_list, 3535 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3536 } 3537 ) 3538 3539 # Annotate 3540 if annotation_bigwig_config_list: 3541 3542 # Annotation config 3543 log.debug( 3544 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3545 ) 3546 3547 # Export VCF file 3548 self.export_variant_vcf( 3549 vcf_file=tmp_vcf_name, 3550 remove_info=True, 3551 add_samples=False, 3552 index=True, 3553 ) 3554 3555 # Load input tmp file 3556 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3557 3558 # Add header in input file 3559 for annotation_bigwig_config in annotation_bigwig_config_list: 3560 for cyvcf2_header_field in annotation_bigwig_config.get( 3561 "cyvcf2_header_list", [] 3562 ): 3563 log.info( 3564 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3565 ) 3566 input_vcf.add_info_to_header(cyvcf2_header_field) 3567 3568 # Create output VCF file 3569 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3570 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3571 3572 # Fetch variants 3573 log.info(f"Annotations 'bigwig' start...") 3574 
for variant in input_vcf: 3575 3576 for annotation_bigwig_config in annotation_bigwig_config_list: 3577 3578 # DB and indexes 3579 bw_db = annotation_bigwig_config.get("bw_db", None) 3580 cyvcf2_header_indexes = annotation_bigwig_config.get( 3581 "cyvcf2_header_indexes", None 3582 ) 3583 3584 # Retrieve value from chrom pos 3585 res = bw_db.values( 3586 variant.CHROM, variant.POS - 1, variant.POS 3587 ) 3588 3589 # For each annotation fields (and indexes) 3590 for cyvcf2_header_index in cyvcf2_header_indexes: 3591 3592 # If value is NOT nNone 3593 if not np.isnan( 3594 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3595 ): 3596 variant.INFO[cyvcf2_header_index] = res[ 3597 cyvcf2_header_indexes[cyvcf2_header_index] 3598 ] 3599 3600 # Add record in output file 3601 output_vcf.write_record(variant) 3602 3603 # Log 3604 log.debug(f"Annotation done.") 3605 3606 # Close and write file 3607 log.info(f"Annotations 'bigwig' write...") 3608 output_vcf.close() 3609 log.debug(f"Write done.") 3610 3611 # Update variants 3612 log.info(f"Annotations 'bigwig' update...") 3613 self.update_from_vcf(output_vcf_file) 3614 log.debug(f"Update done.") 3615 3616 return True 3617 3618 def annotation_snpsift(self, threads: int = None) -> None: 3619 """ 3620 This function annotate with bcftools 3621 3622 :param threads: Number of threads to use 3623 :return: the value of the variable "return_value". 
3624 """ 3625 3626 # DEBUG 3627 log.debug("Start annotation with bcftools databases") 3628 3629 # Threads 3630 if not threads: 3631 threads = self.get_threads() 3632 log.debug("Threads: " + str(threads)) 3633 3634 # Config 3635 config = self.get_config() 3636 log.debug("Config: " + str(config)) 3637 3638 # Config - snpSift 3639 snpsift_bin_command = get_bin_command( 3640 bin="SnpSift.jar", 3641 tool="snpsift", 3642 bin_type="jar", 3643 config=config, 3644 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3645 ) 3646 if not snpsift_bin_command: 3647 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3648 log.error(msg_err) 3649 raise ValueError(msg_err) 3650 3651 # Config - bcftools 3652 bcftools_bin_command = get_bin_command( 3653 bin="bcftools", 3654 tool="bcftools", 3655 bin_type="bin", 3656 config=config, 3657 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3658 ) 3659 if not bcftools_bin_command: 3660 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3661 log.error(msg_err) 3662 raise ValueError(msg_err) 3663 3664 # Config - BCFTools databases folders 3665 databases_folders = set( 3666 self.get_config() 3667 .get("folders", {}) 3668 .get("databases", {}) 3669 .get("annotations", ["."]) 3670 + self.get_config() 3671 .get("folders", {}) 3672 .get("databases", {}) 3673 .get("bcftools", ["."]) 3674 ) 3675 log.debug("Databases annotations: " + str(databases_folders)) 3676 3677 # Param 3678 annotations = ( 3679 self.get_param() 3680 .get("annotation", {}) 3681 .get("snpsift", {}) 3682 .get("annotations", None) 3683 ) 3684 log.debug("Annotations: " + str(annotations)) 3685 3686 # Assembly 3687 assembly = self.get_param().get( 3688 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3689 ) 3690 3691 # Data 3692 table_variants = self.get_table_variants() 3693 3694 # Check if not empty 3695 log.debug("Check if not empty") 3696 sql_query_chromosomes = ( 3697 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3698 ) 3699 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3700 if not sql_query_chromosomes_df["count"][0]: 3701 log.info(f"VCF empty") 3702 return 3703 3704 # VCF header 3705 vcf_reader = self.get_header() 3706 log.debug("Initial header: " + str(vcf_reader.infos)) 3707 3708 # Existing annotations 3709 for vcf_annotation in self.get_header().infos: 3710 3711 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3712 log.debug( 3713 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3714 ) 3715 3716 if annotations: 3717 3718 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3719 3720 # Export VCF file 3721 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3722 3723 # Init 3724 commands = {} 3725 3726 for annotation in annotations: 3727 annotation_fields = annotations[annotation] 3728 3729 # Annotation Name 3730 annotation_name = os.path.basename(annotation) 3731 3732 if not annotation_fields: 3733 annotation_fields = {"INFO": None} 3734 3735 log.debug(f"Annotation '{annotation_name}'") 3736 log.debug( 3737 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3738 ) 3739 3740 # Create Database 3741 database = Database( 3742 database=annotation, 3743 databases_folders=databases_folders, 3744 assembly=assembly, 3745 ) 3746 3747 # Find files 3748 db_file = database.get_database() 3749 db_file = full_path(db_file) 3750 db_hdr_file = database.get_header_file() 3751 db_hdr_file = full_path(db_hdr_file) 3752 db_file_type = database.get_format() 3753 db_tbi_file = f"{db_file}.tbi" 3754 db_file_compressed = database.is_compressed() 3755 3756 # Check if compressed 3757 if not db_file_compressed: 3758 log.error( 3759 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3760 ) 3761 raise ValueError( 3762 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3763 ) 3764 3765 # Check if indexed 3766 if not os.path.exists(db_tbi_file): 3767 log.error( 3768 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3769 ) 3770 raise ValueError( 3771 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 3774 # Check index - try to create if not exists 3775 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3776 log.error("Annotation failed: database not valid") 3777 log.error(f"Annotation annotation file: {db_file}") 3778 log.error(f"Annotation annotation header: {db_hdr_file}") 3779 log.error(f"Annotation annotation index: {db_tbi_file}") 3780 raise ValueError( 3781 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3782 ) 3783 else: 3784 3785 log.debug( 3786 f"Annotation '{annotation}' - file: " 3787 + str(db_file) 3788 + " and " 3789 + str(db_hdr_file) 3790 ) 3791 3792 # Load header as VCF object 3793 db_hdr_vcf = Variants(input=db_hdr_file) 3794 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3795 log.debug( 3796 "Annotation database header: " 3797 + str(db_hdr_vcf_header_infos) 3798 ) 3799 3800 # For all fields in database 3801 annotation_fields_full = False 3802 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3803 annotation_fields = { 3804 key: key for key in db_hdr_vcf_header_infos 3805 } 3806 log.debug( 3807 "Annotation database header - All annotations added: " 3808 + str(annotation_fields) 3809 ) 3810 annotation_fields_full = True 3811 3812 # # Create file for field rename 3813 # log.debug("Create file for field rename") 3814 # tmp_rename = NamedTemporaryFile( 3815 # prefix=self.get_prefix(), 3816 # dir=self.get_tmp_dir(), 3817 # suffix=".rename", 3818 # delete=False, 3819 # ) 3820 # tmp_rename_name = tmp_rename.name 3821 # tmp_files.append(tmp_rename_name) 3822 3823 # Number of fields 3824 nb_annotation_field = 0 3825 annotation_list = [] 3826 annotation_infos_rename_list = [] 3827 3828 for annotation_field in 
annotation_fields: 3829 3830 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3831 annotation_fields_new_name = annotation_fields.get( 3832 annotation_field, annotation_field 3833 ) 3834 if not annotation_fields_new_name: 3835 annotation_fields_new_name = annotation_field 3836 3837 # Check if field is in DB and if field is not elready in input data 3838 if ( 3839 annotation_field in db_hdr_vcf.get_header().infos 3840 and annotation_fields_new_name 3841 not in self.get_header().infos 3842 ): 3843 3844 log.info( 3845 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3846 ) 3847 3848 # BCFTools annotate param to rename fields 3849 if annotation_field != annotation_fields_new_name: 3850 annotation_infos_rename_list.append( 3851 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3852 ) 3853 3854 # Add INFO field to header 3855 db_hdr_vcf_header_infos_number = ( 3856 db_hdr_vcf_header_infos[annotation_field].num or "." 3857 ) 3858 db_hdr_vcf_header_infos_type = ( 3859 db_hdr_vcf_header_infos[annotation_field].type 3860 or "String" 3861 ) 3862 db_hdr_vcf_header_infos_description = ( 3863 db_hdr_vcf_header_infos[annotation_field].desc 3864 or f"{annotation_field} description" 3865 ) 3866 db_hdr_vcf_header_infos_source = ( 3867 db_hdr_vcf_header_infos[annotation_field].source 3868 or "unknown" 3869 ) 3870 db_hdr_vcf_header_infos_version = ( 3871 db_hdr_vcf_header_infos[annotation_field].version 3872 or "unknown" 3873 ) 3874 3875 vcf_reader.infos[annotation_fields_new_name] = ( 3876 vcf.parser._Info( 3877 annotation_fields_new_name, 3878 db_hdr_vcf_header_infos_number, 3879 db_hdr_vcf_header_infos_type, 3880 db_hdr_vcf_header_infos_description, 3881 db_hdr_vcf_header_infos_source, 3882 db_hdr_vcf_header_infos_version, 3883 self.code_type_map[ 3884 db_hdr_vcf_header_infos_type 3885 ], 3886 ) 3887 ) 3888 3889 annotation_list.append(annotation_field) 3890 3891 nb_annotation_field += 1 3892 3893 else: 3894 
3895 if ( 3896 annotation_field 3897 not in db_hdr_vcf.get_header().infos 3898 ): 3899 log.warning( 3900 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3901 ) 3902 if ( 3903 annotation_fields_new_name 3904 in self.get_header().infos 3905 ): 3906 log.warning( 3907 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3908 ) 3909 3910 log.info( 3911 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3912 ) 3913 3914 annotation_infos = ",".join(annotation_list) 3915 3916 if annotation_infos != "": 3917 3918 # Annotated VCF (and error file) 3919 tmp_annotation_vcf_name = os.path.join( 3920 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3921 ) 3922 tmp_annotation_vcf_name_err = ( 3923 tmp_annotation_vcf_name + ".err" 3924 ) 3925 3926 # Add fields to annotate 3927 if not annotation_fields_full: 3928 annotation_infos_option = f"-info {annotation_infos}" 3929 else: 3930 annotation_infos_option = "" 3931 3932 # Info fields rename 3933 if annotation_infos_rename_list: 3934 annotation_infos_rename = " -c " + ",".join( 3935 annotation_infos_rename_list 3936 ) 3937 else: 3938 annotation_infos_rename = "" 3939 3940 # Annotate command 3941 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3942 3943 # Add command 3944 commands[command_annotate] = tmp_annotation_vcf_name 3945 3946 if commands: 3947 3948 # Export VCF file 3949 self.export_variant_vcf( 3950 vcf_file=tmp_vcf_name, 3951 remove_info=True, 3952 add_samples=False, 3953 index=True, 3954 ) 3955 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3956 3957 # Num command 3958 nb_command = 0 3959 3960 # Annotate 3961 for command_annotate in commands: 3962 nb_command += 1 3963 log.info( 3964 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..." 3965 ) 3966 log.debug(f"command_annotate={command_annotate}") 3967 run_parallel_commands([command_annotate], threads) 3968 3969 # Debug 3970 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3971 3972 # Update variants 3973 log.info( 3974 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3975 ) 3976 self.update_from_vcf(commands[command_annotate]) 3977 3978 def annotation_bcftools(self, threads: int = None) -> None: 3979 """ 3980 This function annotate with bcftools 3981 3982 :param threads: Number of threads to use 3983 :return: the value of the variable "return_value". 3984 """ 3985 3986 # DEBUG 3987 log.debug("Start annotation with bcftools databases") 3988 3989 # Threads 3990 if not threads: 3991 threads = self.get_threads() 3992 log.debug("Threads: " + str(threads)) 3993 3994 # Config 3995 config = self.get_config() 3996 log.debug("Config: " + str(config)) 3997 3998 # DEBUG 3999 delete_tmp = True 4000 if self.get_config().get("verbosity", "warning") in ["debug"]: 4001 delete_tmp = False 4002 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4003 4004 # Config - BCFTools bin command 4005 bcftools_bin_command = get_bin_command( 4006 bin="bcftools", 4007 tool="bcftools", 4008 bin_type="bin", 4009 config=config, 4010 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4011 ) 4012 if not bcftools_bin_command: 4013 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4014 log.error(msg_err) 4015 raise ValueError(msg_err) 4016 4017 # Config - BCFTools databases folders 4018 databases_folders = set( 4019 self.get_config() 4020 .get("folders", {}) 4021 .get("databases", {}) 4022 .get("annotations", ["."]) 4023 + self.get_config() 4024 .get("folders", {}) 4025 .get("databases", {}) 4026 .get("bcftools", ["."]) 4027 ) 4028 log.debug("Databases annotations: " + str(databases_folders)) 4029 4030 # Param 4031 annotations = ( 4032 self.get_param() 4033 .get("annotation", {}) 4034 
.get("bcftools", {}) 4035 .get("annotations", None) 4036 ) 4037 log.debug("Annotations: " + str(annotations)) 4038 4039 # Assembly 4040 assembly = self.get_param().get( 4041 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4042 ) 4043 4044 # Data 4045 table_variants = self.get_table_variants() 4046 4047 # Check if not empty 4048 log.debug("Check if not empty") 4049 sql_query_chromosomes = ( 4050 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4051 ) 4052 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4053 if not sql_query_chromosomes_df["count"][0]: 4054 log.info(f"VCF empty") 4055 return 4056 4057 # Export in VCF 4058 log.debug("Create initial file to annotate") 4059 tmp_vcf = NamedTemporaryFile( 4060 prefix=self.get_prefix(), 4061 dir=self.get_tmp_dir(), 4062 suffix=".vcf.gz", 4063 delete=False, 4064 ) 4065 tmp_vcf_name = tmp_vcf.name 4066 4067 # VCF header 4068 vcf_reader = self.get_header() 4069 log.debug("Initial header: " + str(vcf_reader.infos)) 4070 4071 # Existing annotations 4072 for vcf_annotation in self.get_header().infos: 4073 4074 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4075 log.debug( 4076 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4077 ) 4078 4079 if annotations: 4080 4081 tmp_ann_vcf_list = [] 4082 commands = [] 4083 tmp_files = [] 4084 err_files = [] 4085 4086 for annotation in annotations: 4087 annotation_fields = annotations[annotation] 4088 4089 # Annotation Name 4090 annotation_name = os.path.basename(annotation) 4091 4092 if not annotation_fields: 4093 annotation_fields = {"INFO": None} 4094 4095 log.debug(f"Annotation '{annotation_name}'") 4096 log.debug( 4097 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4098 ) 4099 4100 # Create Database 4101 database = Database( 4102 database=annotation, 4103 databases_folders=databases_folders, 4104 assembly=assembly, 4105 ) 4106 4107 # Find files 4108 db_file = 
database.get_database() 4109 db_file = full_path(db_file) 4110 db_hdr_file = database.get_header_file() 4111 db_hdr_file = full_path(db_hdr_file) 4112 db_file_type = database.get_format() 4113 db_tbi_file = f"{db_file}.tbi" 4114 db_file_compressed = database.is_compressed() 4115 4116 # Check if compressed 4117 if not db_file_compressed: 4118 log.error( 4119 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4120 ) 4121 raise ValueError( 4122 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4123 ) 4124 4125 # Check if indexed 4126 if not os.path.exists(db_tbi_file): 4127 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4128 raise ValueError( 4129 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4130 ) 4131 4132 # Check index - try to create if not exists 4133 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4134 log.error("Annotation failed: database not valid") 4135 log.error(f"Annotation annotation file: {db_file}") 4136 log.error(f"Annotation annotation header: {db_hdr_file}") 4137 log.error(f"Annotation annotation index: {db_tbi_file}") 4138 raise ValueError( 4139 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4140 ) 4141 else: 4142 4143 log.debug( 4144 f"Annotation '{annotation}' - file: " 4145 + str(db_file) 4146 + " and " 4147 + str(db_hdr_file) 4148 ) 4149 4150 # Load header as VCF object 4151 db_hdr_vcf = Variants(input=db_hdr_file) 4152 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4153 log.debug( 4154 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4155 ) 4156 4157 # For all fields in database 4158 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4159 annotation_fields = { 4160 key: key for key in db_hdr_vcf_header_infos 4161 } 4162 log.debug( 4163 "Annotation database header - All annotations added: " 4164 + 
str(annotation_fields) 4165 ) 4166 4167 # Number of fields 4168 nb_annotation_field = 0 4169 annotation_list = [] 4170 4171 for annotation_field in annotation_fields: 4172 4173 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 4174 annotation_fields_new_name = annotation_fields.get( 4175 annotation_field, annotation_field 4176 ) 4177 if not annotation_fields_new_name: 4178 annotation_fields_new_name = annotation_field 4179 4180 # Check if field is in DB and if field is not elready in input data 4181 if ( 4182 annotation_field in db_hdr_vcf.get_header().infos 4183 and annotation_fields_new_name 4184 not in self.get_header().infos 4185 ): 4186 4187 log.info( 4188 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4189 ) 4190 4191 # Add INFO field to header 4192 db_hdr_vcf_header_infos_number = ( 4193 db_hdr_vcf_header_infos[annotation_field].num or "." 4194 ) 4195 db_hdr_vcf_header_infos_type = ( 4196 db_hdr_vcf_header_infos[annotation_field].type 4197 or "String" 4198 ) 4199 db_hdr_vcf_header_infos_description = ( 4200 db_hdr_vcf_header_infos[annotation_field].desc 4201 or f"{annotation_field} description" 4202 ) 4203 db_hdr_vcf_header_infos_source = ( 4204 db_hdr_vcf_header_infos[annotation_field].source 4205 or "unknown" 4206 ) 4207 db_hdr_vcf_header_infos_version = ( 4208 db_hdr_vcf_header_infos[annotation_field].version 4209 or "unknown" 4210 ) 4211 4212 vcf_reader.infos[annotation_fields_new_name] = ( 4213 vcf.parser._Info( 4214 annotation_fields_new_name, 4215 db_hdr_vcf_header_infos_number, 4216 db_hdr_vcf_header_infos_type, 4217 db_hdr_vcf_header_infos_description, 4218 db_hdr_vcf_header_infos_source, 4219 db_hdr_vcf_header_infos_version, 4220 self.code_type_map[db_hdr_vcf_header_infos_type], 4221 ) 4222 ) 4223 4224 # annotation_list.append(annotation_field) 4225 if annotation_field != annotation_fields_new_name: 4226 annotation_list.append( 4227 
f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4228 ) 4229 else: 4230 annotation_list.append(annotation_field) 4231 4232 nb_annotation_field += 1 4233 4234 else: 4235 4236 if annotation_field not in db_hdr_vcf.get_header().infos: 4237 log.warning( 4238 f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4239 ) 4240 if annotation_fields_new_name in self.get_header().infos: 4241 log.warning( 4242 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4243 ) 4244 4245 log.info( 4246 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4247 ) 4248 4249 annotation_infos = ",".join(annotation_list) 4250 4251 if annotation_infos != "": 4252 4253 # Protect header for bcftools (remove "#CHROM" and variants line) 4254 log.debug("Protect Header file - remove #CHROM line if exists") 4255 tmp_header_vcf = NamedTemporaryFile( 4256 prefix=self.get_prefix(), 4257 dir=self.get_tmp_dir(), 4258 suffix=".hdr", 4259 delete=False, 4260 ) 4261 tmp_header_vcf_name = tmp_header_vcf.name 4262 tmp_files.append(tmp_header_vcf_name) 4263 # Command 4264 if db_hdr_file.endswith(".gz"): 4265 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4266 else: 4267 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4268 # Run 4269 run_parallel_commands([command_extract_header], 1) 4270 4271 # Find chomosomes 4272 log.debug("Find chromosomes ") 4273 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4274 sql_query_chromosomes_df = self.get_query_to_df( 4275 sql_query_chromosomes 4276 ) 4277 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4278 4279 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4280 4281 # BED columns in the annotation file 4282 if db_file_type in ["bed"]: 4283 annotation_infos = 
"CHROM,POS,POS," + annotation_infos 4284 4285 for chrom in chomosomes_list: 4286 4287 # Create BED on initial VCF 4288 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4289 tmp_bed = NamedTemporaryFile( 4290 prefix=self.get_prefix(), 4291 dir=self.get_tmp_dir(), 4292 suffix=".bed", 4293 delete=False, 4294 ) 4295 tmp_bed_name = tmp_bed.name 4296 tmp_files.append(tmp_bed_name) 4297 4298 # Detecte regions 4299 log.debug( 4300 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4301 ) 4302 window = 1000000 4303 sql_query_intervals_for_bed = f""" 4304 SELECT \"#CHROM\", 4305 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4306 \"POS\"+{window} 4307 FROM {table_variants} as table_variants 4308 WHERE table_variants.\"#CHROM\" = '{chrom}' 4309 """ 4310 regions = self.conn.execute( 4311 sql_query_intervals_for_bed 4312 ).fetchall() 4313 merged_regions = merge_regions(regions) 4314 log.debug( 4315 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 
4316 ) 4317 4318 header = ["#CHROM", "START", "END"] 4319 with open(tmp_bed_name, "w") as f: 4320 # Write the header with tab delimiter 4321 f.write("\t".join(header) + "\n") 4322 for d in merged_regions: 4323 # Write each data row with tab delimiter 4324 f.write("\t".join(map(str, d)) + "\n") 4325 4326 # Tmp files 4327 tmp_annotation_vcf = NamedTemporaryFile( 4328 prefix=self.get_prefix(), 4329 dir=self.get_tmp_dir(), 4330 suffix=".vcf.gz", 4331 delete=False, 4332 ) 4333 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4334 tmp_files.append(tmp_annotation_vcf_name) 4335 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4336 tmp_annotation_vcf_name_err = ( 4337 tmp_annotation_vcf_name + ".err" 4338 ) 4339 err_files.append(tmp_annotation_vcf_name_err) 4340 4341 # Annotate Command 4342 log.debug( 4343 f"Annotation '{annotation}' - add bcftools command" 4344 ) 4345 4346 # Command 4347 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 4348 4349 # Add command 4350 commands.append(command_annotate) 4351 4352 # if some commands 4353 if commands: 4354 4355 # Export VCF file 4356 self.export_variant_vcf( 4357 vcf_file=tmp_vcf_name, 4358 remove_info=True, 4359 add_samples=False, 4360 index=True, 4361 ) 4362 4363 # Threads 4364 # calculate threads for annotated commands 4365 if commands: 4366 threads_bcftools_annotate = round(threads / len(commands)) 4367 else: 4368 threads_bcftools_annotate = 1 4369 4370 if not threads_bcftools_annotate: 4371 threads_bcftools_annotate = 1 4372 4373 # Add threads option to bcftools commands 4374 if threads_bcftools_annotate > 1: 4375 commands_threaded = [] 4376 for command in commands: 4377 commands_threaded.append( 4378 command.replace( 4379 f"{bcftools_bin_command} annotate ", 
4380 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4381 ) 4382 ) 4383 commands = commands_threaded 4384 4385 # Command annotation multithreading 4386 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4387 log.info( 4388 f"Annotation - Annotation multithreaded in " 4389 + str(len(commands)) 4390 + " commands" 4391 ) 4392 4393 run_parallel_commands(commands, threads) 4394 4395 # Merge 4396 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4397 4398 if tmp_ann_vcf_list_cmd: 4399 4400 # Tmp file 4401 tmp_annotate_vcf = NamedTemporaryFile( 4402 prefix=self.get_prefix(), 4403 dir=self.get_tmp_dir(), 4404 suffix=".vcf.gz", 4405 delete=True, 4406 ) 4407 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4408 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4409 err_files.append(tmp_annotate_vcf_name_err) 4410 4411 # Tmp file remove command 4412 tmp_files_remove_command = "" 4413 if tmp_files: 4414 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4415 4416 # Command merge 4417 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4418 log.info( 4419 f"Annotation - Annotation merging " 4420 + str(len(commands)) 4421 + " annotated files" 4422 ) 4423 log.debug(f"Annotation - merge command: {merge_command}") 4424 run_parallel_commands([merge_command], 1) 4425 4426 # Error messages 4427 log.info(f"Error/Warning messages:") 4428 error_message_command_all = [] 4429 error_message_command_warning = [] 4430 error_message_command_err = [] 4431 for err_file in err_files: 4432 with open(err_file, "r") as f: 4433 for line in f: 4434 message = line.strip() 4435 error_message_command_all.append(message) 4436 if line.startswith("[W::"): 4437 error_message_command_warning.append(message) 4438 if line.startswith("[E::"): 4439 error_message_command_err.append( 4440 f"{err_file}: 
" + message 4441 ) 4442 # log info 4443 for message in list( 4444 set(error_message_command_err + error_message_command_warning) 4445 ): 4446 log.info(f" {message}") 4447 # debug info 4448 for message in list(set(error_message_command_all)): 4449 log.debug(f" {message}") 4450 # failed 4451 if len(error_message_command_err): 4452 log.error("Annotation failed: Error in commands") 4453 raise ValueError("Annotation failed: Error in commands") 4454 4455 # Update variants 4456 log.info(f"Annotation - Updating...") 4457 self.update_from_vcf(tmp_annotate_vcf_name) 4458 4459 def annotation_exomiser(self, threads: int = None) -> None: 4460 """ 4461 This function annotate with Exomiser 4462 4463 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4464 - "analysis" (dict/file): 4465 Full analysis dictionnary parameters (see Exomiser docs). 4466 Either a dict, or a file in JSON or YAML format. 4467 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4468 Default : None 4469 - "preset" (string): 4470 Analysis preset (available in config folder). 4471 Used if no full "analysis" is provided. 4472 Default: "exome" 4473 - "phenopacket" (dict/file): 4474 Samples and phenotipic features parameters (see Exomiser docs). 4475 Either a dict, or a file in JSON or YAML format. 4476 Default: None 4477 - "subject" (dict): 4478 Sample parameters (see Exomiser docs). 4479 Example: 4480 "subject": 4481 { 4482 "id": "ISDBM322017", 4483 "sex": "FEMALE" 4484 } 4485 Default: None 4486 - "sample" (string): 4487 Sample name to construct "subject" section: 4488 "subject": 4489 { 4490 "id": "<sample>", 4491 "sex": "UNKNOWN_SEX" 4492 } 4493 Default: None 4494 - "phenotypicFeatures" (dict) 4495 Phenotypic features to construct "subject" section. 
4496 Example: 4497 "phenotypicFeatures": 4498 [ 4499 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4500 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4501 ] 4502 - "hpo" (list) 4503 List of HPO ids as phenotypic features. 4504 Example: 4505 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4506 Default: [] 4507 - "outputOptions" (dict): 4508 Output options (see Exomiser docs). 4509 Default: 4510 "output_options" = 4511 { 4512 "outputContributingVariantsOnly": False, 4513 "numGenes": 0, 4514 "outputFormats": ["TSV_VARIANT", "VCF"] 4515 } 4516 - "transcript_source" (string): 4517 Transcript source (either "refseq", "ucsc", "ensembl") 4518 Default: "refseq" 4519 - "exomiser_to_info" (boolean): 4520 Add exomiser TSV file columns as INFO fields in VCF. 4521 Default: False 4522 - "release" (string): 4523 Exomise database release. 4524 If not exists, database release will be downloaded (take a while). 4525 Default: None (provided by application.properties configuration file) 4526 - "exomiser_application_properties" (file): 4527 Exomiser configuration file (see Exomiser docs). 4528 Useful to automatically download databases (especially for specific genome databases). 4529 4530 Notes: 4531 - If no sample in parameters, first sample in VCF will be chosen 4532 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4533 4534 :param threads: The number of threads to use 4535 :return: None. 
4536 """ 4537 4538 # DEBUG 4539 log.debug("Start annotation with Exomiser databases") 4540 4541 # Threads 4542 if not threads: 4543 threads = self.get_threads() 4544 log.debug("Threads: " + str(threads)) 4545 4546 # Config 4547 config = self.get_config() 4548 log.debug("Config: " + str(config)) 4549 4550 # Config - Folders - Databases 4551 databases_folders = ( 4552 config.get("folders", {}) 4553 .get("databases", {}) 4554 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4555 ) 4556 databases_folders = full_path(databases_folders) 4557 if not os.path.exists(databases_folders): 4558 log.error(f"Databases annotations: {databases_folders} NOT found") 4559 log.debug("Databases annotations: " + str(databases_folders)) 4560 4561 # Config - Exomiser 4562 exomiser_bin_command = get_bin_command( 4563 bin="exomiser-cli*.jar", 4564 tool="exomiser", 4565 bin_type="jar", 4566 config=config, 4567 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4568 ) 4569 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4570 if not exomiser_bin_command: 4571 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4572 log.error(msg_err) 4573 raise ValueError(msg_err) 4574 4575 # Param 4576 param = self.get_param() 4577 log.debug("Param: " + str(param)) 4578 4579 # Param - Exomiser 4580 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4581 log.debug(f"Param Exomiser: {param_exomiser}") 4582 4583 # Param - Assembly 4584 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4585 log.debug("Assembly: " + str(assembly)) 4586 4587 # Data 4588 table_variants = self.get_table_variants() 4589 4590 # Check if not empty 4591 log.debug("Check if not empty") 4592 sql_query_chromosomes = ( 4593 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4594 ) 4595 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4596 log.info(f"VCF empty") 4597 return False 4598 4599 # VCF header 4600 
vcf_reader = self.get_header() 4601 log.debug("Initial header: " + str(vcf_reader.infos)) 4602 4603 # Samples 4604 samples = self.get_header_sample_list() 4605 if not samples: 4606 log.error("No Samples in VCF") 4607 return False 4608 log.debug(f"Samples: {samples}") 4609 4610 # Memory limit 4611 memory_limit = self.get_memory("8G") 4612 log.debug(f"memory_limit: {memory_limit}") 4613 4614 # Exomiser java options 4615 exomiser_java_options = ( 4616 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4617 ) 4618 log.debug(f"Exomiser java options: {exomiser_java_options}") 4619 4620 # Download Exomiser (if not exists) 4621 exomiser_release = param_exomiser.get("release", None) 4622 exomiser_application_properties = param_exomiser.get( 4623 "exomiser_application_properties", None 4624 ) 4625 databases_download_exomiser( 4626 assemblies=[assembly], 4627 exomiser_folder=databases_folders, 4628 exomiser_release=exomiser_release, 4629 exomiser_phenotype_release=exomiser_release, 4630 exomiser_application_properties=exomiser_application_properties, 4631 ) 4632 4633 # Force annotation 4634 force_update_annotation = True 4635 4636 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4637 log.debug("Start annotation Exomiser") 4638 4639 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4640 4641 # tmp_dir = "/tmp/exomiser" 4642 4643 ### ANALYSIS ### 4644 ################ 4645 4646 # Create analysis.json through analysis dict 4647 # either analysis in param or by default 4648 # depending on preset exome/genome) 4649 4650 # Init analysis dict 4651 param_exomiser_analysis_dict = {} 4652 4653 # analysis from param 4654 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4655 param_exomiser_analysis = full_path(param_exomiser_analysis) 4656 4657 # If analysis in param -> load anlaysis json 4658 if param_exomiser_analysis: 4659 4660 # If param analysis is a file and exists 4661 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4662 param_exomiser_analysis 4663 ): 4664 # Load analysis file into analysis dict (either yaml or json) 4665 with open(param_exomiser_analysis) as json_file: 4666 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4667 4668 # If param analysis is a dict 4669 elif isinstance(param_exomiser_analysis, dict): 4670 # Load analysis dict into analysis dict (either yaml or json) 4671 param_exomiser_analysis_dict = param_exomiser_analysis 4672 4673 # Error analysis type 4674 else: 4675 log.error(f"Analysis type unknown. Check param file.") 4676 raise ValueError(f"Analysis type unknown. Check param file.") 4677 4678 # Case no input analysis config file/dict 4679 # Use preset (exome/genome) to open default config file 4680 if not param_exomiser_analysis_dict: 4681 4682 # default preset 4683 default_preset = "exome" 4684 4685 # Get param preset or default preset 4686 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4687 4688 # Try to find if preset is a file 4689 if os.path.exists(param_exomiser_preset): 4690 # Preset file is provided in full path 4691 param_exomiser_analysis_default_config_file = ( 4692 param_exomiser_preset 4693 ) 4694 # elif os.path.exists(full_path(param_exomiser_preset)): 4695 # # Preset file is provided in full path 4696 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4697 elif os.path.exists( 4698 os.path.join(folder_config, param_exomiser_preset) 4699 ): 4700 # Preset file is provided a basename in config folder (can be a path with subfolders) 4701 param_exomiser_analysis_default_config_file = os.path.join( 4702 folder_config, param_exomiser_preset 4703 ) 4704 else: 4705 # Construct preset file 4706 param_exomiser_analysis_default_config_file = os.path.join( 4707 folder_config, 4708 f"preset-{param_exomiser_preset}-analysis.json", 4709 ) 4710 4711 # If preset file exists 4712 param_exomiser_analysis_default_config_file = full_path( 4713 
param_exomiser_analysis_default_config_file 4714 ) 4715 if os.path.exists(param_exomiser_analysis_default_config_file): 4716 # Load prest file into analysis dict (either yaml or json) 4717 with open( 4718 param_exomiser_analysis_default_config_file 4719 ) as json_file: 4720 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4721 json_file 4722 ) 4723 4724 # Error preset file 4725 else: 4726 log.error( 4727 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4728 ) 4729 raise ValueError( 4730 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4731 ) 4732 4733 # If no analysis dict created 4734 if not param_exomiser_analysis_dict: 4735 log.error(f"No analysis config") 4736 raise ValueError(f"No analysis config") 4737 4738 # Log 4739 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4740 4741 ### PHENOPACKET ### 4742 ################### 4743 4744 # If no PhenoPacket in analysis dict -> check in param 4745 if "phenopacket" not in param_exomiser_analysis_dict: 4746 4747 # If PhenoPacket in param -> load anlaysis json 4748 if param_exomiser.get("phenopacket", None): 4749 4750 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4751 param_exomiser_phenopacket = full_path( 4752 param_exomiser_phenopacket 4753 ) 4754 4755 # If param phenopacket is a file and exists 4756 if isinstance( 4757 param_exomiser_phenopacket, str 4758 ) and os.path.exists(param_exomiser_phenopacket): 4759 # Load phenopacket file into analysis dict (either yaml or json) 4760 with open(param_exomiser_phenopacket) as json_file: 4761 param_exomiser_analysis_dict["phenopacket"] = ( 4762 yaml.safe_load(json_file) 4763 ) 4764 4765 # If param phenopacket is a dict 4766 elif isinstance(param_exomiser_phenopacket, dict): 4767 # Load phenopacket dict into analysis dict (either yaml or json) 4768 param_exomiser_analysis_dict["phenopacket"] = ( 4769 param_exomiser_phenopacket 4770 ) 4771 4772 # Error phenopacket type 
4773 else: 4774 log.error(f"Phenopacket type unknown. Check param file.") 4775 raise ValueError( 4776 f"Phenopacket type unknown. Check param file." 4777 ) 4778 4779 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4780 if "phenopacket" not in param_exomiser_analysis_dict: 4781 4782 # Init PhenoPacket 4783 param_exomiser_analysis_dict["phenopacket"] = { 4784 "id": "analysis", 4785 "proband": {}, 4786 } 4787 4788 ### Add subject ### 4789 4790 # If subject exists 4791 param_exomiser_subject = param_exomiser.get("subject", {}) 4792 4793 # If subject not exists -> found sample ID 4794 if not param_exomiser_subject: 4795 4796 # Found sample ID in param 4797 sample = param_exomiser.get("sample", None) 4798 4799 # Find sample ID (first sample) 4800 if not sample: 4801 sample_list = self.get_header_sample_list() 4802 if len(sample_list) > 0: 4803 sample = sample_list[0] 4804 else: 4805 log.error(f"No sample found") 4806 raise ValueError(f"No sample found") 4807 4808 # Create subject 4809 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4810 4811 # Add to dict 4812 param_exomiser_analysis_dict["phenopacket"][ 4813 "subject" 4814 ] = param_exomiser_subject 4815 4816 ### Add "phenotypicFeatures" ### 4817 4818 # If phenotypicFeatures exists 4819 param_exomiser_phenotypicfeatures = param_exomiser.get( 4820 "phenotypicFeatures", [] 4821 ) 4822 4823 # If phenotypicFeatures not exists -> Try to infer from hpo list 4824 if not param_exomiser_phenotypicfeatures: 4825 4826 # Found HPO in param 4827 param_exomiser_hpo = param_exomiser.get("hpo", []) 4828 4829 # Split HPO if list in string format separated by comma 4830 if isinstance(param_exomiser_hpo, str): 4831 param_exomiser_hpo = param_exomiser_hpo.split(",") 4832 4833 # Create HPO list 4834 for hpo in param_exomiser_hpo: 4835 hpo_clean = re.sub("[^0-9]", "", hpo) 4836 param_exomiser_phenotypicfeatures.append( 4837 { 4838 "type": { 4839 "id": f"HP:{hpo_clean}", 4840 "label": 
f"HP:{hpo_clean}", 4841 } 4842 } 4843 ) 4844 4845 # Add to dict 4846 param_exomiser_analysis_dict["phenopacket"][ 4847 "phenotypicFeatures" 4848 ] = param_exomiser_phenotypicfeatures 4849 4850 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4851 if not param_exomiser_phenotypicfeatures: 4852 for step in param_exomiser_analysis_dict.get( 4853 "analysis", {} 4854 ).get("steps", []): 4855 if "hiPhivePrioritiser" in step: 4856 param_exomiser_analysis_dict.get("analysis", {}).get( 4857 "steps", [] 4858 ).remove(step) 4859 4860 ### Add Input File ### 4861 4862 # Initial file name and htsFiles 4863 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4864 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4865 { 4866 "uri": tmp_vcf_name, 4867 "htsFormat": "VCF", 4868 "genomeAssembly": assembly, 4869 } 4870 ] 4871 4872 ### Add metaData ### 4873 4874 # If metaData not in analysis dict 4875 if "metaData" not in param_exomiser_analysis_dict: 4876 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4877 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4878 "createdBy": "howard", 4879 "phenopacketSchemaVersion": 1, 4880 } 4881 4882 ### OutputOptions ### 4883 4884 # Init output result folder 4885 output_results = os.path.join(tmp_dir, "results") 4886 4887 # If no outputOptions in analysis dict 4888 if "outputOptions" not in param_exomiser_analysis_dict: 4889 4890 # default output formats 4891 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4892 4893 # Get outputOptions in param 4894 output_options = param_exomiser.get("outputOptions", None) 4895 4896 # If no output_options in param -> check 4897 if not output_options: 4898 output_options = { 4899 "outputContributingVariantsOnly": False, 4900 "numGenes": 0, 4901 "outputFormats": defaut_output_formats, 4902 } 4903 4904 # Replace outputDirectory in output options 4905 output_options["outputDirectory"] = output_results 4906 output_options["outputFileName"] = "howard" 4907 4908 # 
Add outputOptions in analysis dict 4909 param_exomiser_analysis_dict["outputOptions"] = output_options 4910 4911 else: 4912 4913 # Replace output_results and output format (if exists in param) 4914 param_exomiser_analysis_dict["outputOptions"][ 4915 "outputDirectory" 4916 ] = output_results 4917 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4918 list( 4919 set( 4920 param_exomiser_analysis_dict.get( 4921 "outputOptions", {} 4922 ).get("outputFormats", []) 4923 + ["TSV_VARIANT", "VCF"] 4924 ) 4925 ) 4926 ) 4927 4928 # log 4929 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4930 4931 ### ANALYSIS FILE ### 4932 ##################### 4933 4934 ### Full JSON analysis config file ### 4935 4936 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4937 with open(exomiser_analysis, "w") as fp: 4938 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4939 4940 ### SPLIT analysis and sample config files 4941 4942 # Splitted analysis dict 4943 param_exomiser_analysis_dict_for_split = ( 4944 param_exomiser_analysis_dict.copy() 4945 ) 4946 4947 # Phenopacket JSON file 4948 exomiser_analysis_phenopacket = os.path.join( 4949 tmp_dir, "analysis_phenopacket.json" 4950 ) 4951 with open(exomiser_analysis_phenopacket, "w") as fp: 4952 json.dump( 4953 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4954 fp, 4955 indent=4, 4956 ) 4957 4958 # Analysis JSON file without Phenopacket parameters 4959 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4960 exomiser_analysis_analysis = os.path.join( 4961 tmp_dir, "analysis_analysis.json" 4962 ) 4963 with open(exomiser_analysis_analysis, "w") as fp: 4964 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4965 4966 ### INITAL VCF file ### 4967 ####################### 4968 4969 ### Create list of samples to use and include inti initial VCF file #### 4970 4971 # Subject (main sample) 4972 # Get sample ID in analysis dict 4973 sample_subject = ( 4974 
param_exomiser_analysis_dict.get("phenopacket", {}) 4975 .get("subject", {}) 4976 .get("id", None) 4977 ) 4978 sample_proband = ( 4979 param_exomiser_analysis_dict.get("phenopacket", {}) 4980 .get("proband", {}) 4981 .get("subject", {}) 4982 .get("id", None) 4983 ) 4984 sample = [] 4985 if sample_subject: 4986 sample.append(sample_subject) 4987 if sample_proband: 4988 sample.append(sample_proband) 4989 4990 # Get sample ID within Pedigree 4991 pedigree_persons_list = ( 4992 param_exomiser_analysis_dict.get("phenopacket", {}) 4993 .get("pedigree", {}) 4994 .get("persons", {}) 4995 ) 4996 4997 # Create list with all sample ID in pedigree (if exists) 4998 pedigree_persons = [] 4999 for person in pedigree_persons_list: 5000 pedigree_persons.append(person.get("individualId")) 5001 5002 # Concat subject sample ID and samples ID in pedigreesamples 5003 samples = list(set(sample + pedigree_persons)) 5004 5005 # Check if sample list is not empty 5006 if not samples: 5007 log.error(f"No samples found") 5008 raise ValueError(f"No samples found") 5009 5010 # Create VCF with sample (either sample in param or first one by default) 5011 # Export VCF file 5012 self.export_variant_vcf( 5013 vcf_file=tmp_vcf_name, 5014 remove_info=True, 5015 add_samples=True, 5016 list_samples=samples, 5017 index=False, 5018 ) 5019 5020 ### Execute Exomiser ### 5021 ######################## 5022 5023 # Init command 5024 exomiser_command = "" 5025 5026 # Command exomiser options 5027 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5028 5029 # Release 5030 exomiser_release = param_exomiser.get("release", None) 5031 if exomiser_release: 5032 # phenotype data version 5033 exomiser_options += ( 5034 f" --exomiser.phenotype.data-version={exomiser_release} " 5035 ) 5036 # data version 5037 exomiser_options += ( 5038 f" --exomiser.{assembly}.data-version={exomiser_release} " 5039 ) 5040 # variant 
white list 5041 variant_white_list_file = ( 5042 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5043 ) 5044 if os.path.exists( 5045 os.path.join( 5046 databases_folders, assembly, variant_white_list_file 5047 ) 5048 ): 5049 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5050 5051 # transcript_source 5052 transcript_source = param_exomiser.get( 5053 "transcript_source", None 5054 ) # ucsc, refseq, ensembl 5055 if transcript_source: 5056 exomiser_options += ( 5057 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5058 ) 5059 5060 # If analysis contain proband param 5061 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5062 "proband", {} 5063 ): 5064 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5065 5066 # If no proband (usually uniq sample) 5067 else: 5068 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5069 5070 # Log 5071 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5072 5073 # Run command 5074 result = subprocess.call( 5075 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5076 ) 5077 if result: 5078 log.error("Exomiser command failed") 5079 raise ValueError("Exomiser command failed") 5080 5081 ### RESULTS ### 5082 ############### 5083 5084 ### Annotate with TSV fields ### 5085 5086 # Init result tsv file 5087 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5088 5089 # Init result tsv file 5090 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5091 5092 # Parse TSV file and explode columns in INFO field 5093 if exomiser_to_info and os.path.exists(output_results_tsv): 5094 5095 # Log 5096 log.debug("Exomiser columns to VCF INFO field") 5097 5098 # Retrieve columns and types 5099 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5100 output_results_tsv_df = self.get_query_to_df(query) 5101 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5102 5103 # Init concat fields for update 5104 sql_query_update_concat_fields = [] 5105 5106 # Fields to avoid 5107 fields_to_avoid = [ 5108 "CONTIG", 5109 "START", 5110 "END", 5111 "REF", 5112 "ALT", 5113 "QUAL", 5114 "FILTER", 5115 "GENOTYPE", 5116 ] 5117 5118 # List all columns to add into header 5119 for header_column in output_results_tsv_columns: 5120 5121 # If header column is enable 5122 if header_column not in fields_to_avoid: 5123 5124 # Header info type 5125 header_info_type = "String" 5126 header_column_df = output_results_tsv_df[header_column] 5127 header_column_df_dtype = header_column_df.dtype 5128 if header_column_df_dtype == object: 5129 if ( 5130 pd.to_numeric(header_column_df, errors="coerce") 5131 .notnull() 5132 .all() 5133 ): 5134 header_info_type = "Float" 5135 else: 5136 header_info_type = "Integer" 5137 5138 # Header info 5139 characters_to_validate = ["-"] 5140 pattern = "[" + "".join(characters_to_validate) + "]" 5141 header_info_name = re.sub( 5142 pattern, 5143 "_", 5144 f"Exomiser_{header_column}".replace("#", ""), 5145 ) 5146 header_info_number = "." 
                    header_info_description = (
                        f"Exomiser {header_column} annotation"
                    )
                    header_info_source = "Exomiser"
                    header_info_version = "unknown"
                    header_info_code = CODE_TYPE_MAP[header_info_type]
                    # Declare the new INFO field in the in-memory VCF header so
                    # the annotation survives export
                    vcf_reader.infos[header_info_name] = vcf.parser._Info(
                        header_info_name,
                        header_info_number,
                        header_info_type,
                        header_info_description,
                        header_info_source,
                        header_info_version,
                        header_info_code,
                    )

                    # Add field to add for update to concat fields
                    # (emits '<name>=<value>;' only when the TSV cell is
                    # neither empty nor '.')
                    sql_query_update_concat_fields.append(
                        f"""
                        CASE
                            WHEN table_parquet."{header_column}" NOT IN ('','.')
                            THEN concat(
                                '{header_info_name}=',
                                table_parquet."{header_column}",
                                ';'
                            )

                            ELSE ''
                        END
                        """
                    )

            # Update query
            # Join the Exomiser TSV back onto the variants table on
            # chromosome (TSV CONTIG gets a 'chr' prefix), position, ALT and
            # REF, then append the concatenated annotations to INFO.
            # NOTE(review): the second CASE tests table_variants.INFO like the
            # first, so the ';' separator is appended whenever INFO was
            # non-empty, regardless of whether the subquery adds anything, and
            # each concat field already ends with ';' — confirm trailing
            # separators are acceptable downstream.
            sql_query_update = f"""
                UPDATE {table_variants} as table_variants
                SET INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN table_variants.INFO NOT IN ('','.')
                        THEN ';'
                        ELSE ''
                    END,
                    (
                        SELECT
                            concat(
                                {",".join(sql_query_update_concat_fields)}
                            )
                        FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                        WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                        AND table_parquet.\"START\" = table_variants.\"POS\"
                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                    )
                )
                ;
                """

            # Update
            self.conn.execute(sql_query_update)

        ### Annotate with VCF INFO field ###

        # Init result VCF file
        output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

        # If VCF exists
        if os.path.exists(output_results_vcf):

            # Log
            log.debug("Exomiser result VCF update variants")

            # Find Exomiser INFO field annotation in header
            with gzip.open(output_results_vcf, "rt") as f:
                header_list = self.read_vcf_header(f)
            exomiser_vcf_header = vcf.Reader(
                io.StringIO("\n".join(header_list))
            )

            # Add annotation INFO field to header
            vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

            # Update variants with VCF
            self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with snpEff.

        Exports the variants to a temporary VCF, runs the snpEff jar on it
        (downloading the snpEff database for the assembly if needed), then
        merges the resulting INFO annotations back into the variants table and
        the in-memory VCF header.

        :param threads: number of threads to use (defaults to
            ``self.get_threads()``)
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads (fall back to the instance-level setting)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not referenced later in
        # this method — presumably kept for symmetry with other annotators.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases (logged only; the effective folder is
        # snpeff_databases below)
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command (resolved java -jar invocation)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        # NOTE(review): 'options' duplicates the snpeff_options lookup below
        # and is only logged here.
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options: raw snpEff CLI options plus optional stats reports
        # ('OUTPUT' placeholder in stats paths is replaced by the output file)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate on an empty table)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF (temporary input file handed to snpEff)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is not referenced in the snpEff
        # command below — presumably the JVM options are already embedded in
        # snpeff_bin_command; confirm. The log label says "Exomiser" but these
        # are snpEff options (copy/paste in the message).
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Hard-coded: always (re)annotate, even if 'ANN' already in header
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloads the assembly database if absent)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped; snpEff only needs sites)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file for snpEff stdout, plus its stderr capture
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF to stdout; stderr is
            # appended to the .err file and scanned below
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages (collected from the .err capture files)
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            # Scan stderr captures for htslib-style '[W::' / '[E::' prefixes.
            # NOTE(review): snpEff's own java stderr lines may not use these
            # prefixes — confirm errors from the jar itself are detected.
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any '[E::' line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header of the snpEff output and import any
            # INFO fields not already declared
            # NOTE(review): variable is named 'annovar_vcf_header' but holds
            # the snpEff output header here.
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # NOTE(review): unreachable while force_update_annotation is
            # hard-coded True above.
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar (table_annovar.pl).

        For each configured Annovar database: exports the variants to a
        temporary VCF, runs table_annovar.pl, post-processes the result with
        bcftools/sed/awk (keeping only the requested INFO fields and renaming
        them), then merges all annotated files and updates the variants table
        and the in-memory VCF header with the new INFO fields.

        :param threads: number of threads to use (defaults to
            ``self.get_threads()``)
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads (fall back to the instance-level setting)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (everything appended here is removed at the end)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but not referenced later in
        # this method; cleanup below is unconditional.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases (logged only; the effective folder is
        # annovar_databases below)
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (resolved perl invocation)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for the post-processing pipe)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (a list selects its first entry;
        # created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl CLI options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of annovar database -> fields to keep
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate on an empty table)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded: always (re)annotate existing fields
        force_update_annotation = True

        if annotations:
            # NOTE(review): 'commands' is never used below — databases are run
            # one at a time via run_parallel_commands.
            commands = []
            # Per-database annotated VCFs, merged after the loop
            tmp_annotates_vcf_name_list = []

            # Export in VCF (shared input file for every annovar run)
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots input)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Empty field spec means "keep the whole INFO"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): this reassignment shadows the outer err_files
                # list, so only the current iteration's .err file is scanned.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl names its VCF output <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    # NOTE(review): appended via shell echo for every field,
                    # including skipped ones — confirm that is intended.
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol (the annovar database name)
                protocol = annotation

                # argument
                argument = ""

                # operation: 'f' filter-based by default, 'g' gene-based for
                # refGene/ensGene databases, 'r' region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ('genebase' is consumed above, not forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar: run table_annovar.pl then move its
                # multianno output to a predictable .tmp.vcf for the pipe below
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5738 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5739 5740 # Command - Special characters (refGene annotation) 5741 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5742 5743 # Command - Clean empty fields (with value ".") 5744 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5745 5746 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5747 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5748 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5749 # for ann in annotation_renamed_list: 5750 for ann in annotation_list: 5751 annovar_fields_to_keep.append(f"^INFO/{ann}") 5752 5753 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5754 5755 # Command - indexing 5756 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5757 5758 log.debug(f"Annotation - Annovar command: {command_annovar}") 5759 run_parallel_commands([command_annovar], 1) 5760 5761 # Error messages 5762 log.info(f"Error/Warning messages:") 5763 error_message_command_all = [] 5764 error_message_command_warning = [] 5765 error_message_command_err = [] 5766 for err_file in err_files: 5767 with open(err_file, "r") as f: 5768 for line in f: 5769 message = line.strip() 5770 error_message_command_all.append(message) 5771 if line.startswith("[W::") or line.startswith("WARNING"): 5772 error_message_command_warning.append(message) 5773 if line.startswith("[E::") or line.startswith("ERROR"): 5774 
error_message_command_err.append( 5775 f"{err_file}: " + message 5776 ) 5777 # log info 5778 for message in list( 5779 set(error_message_command_err + error_message_command_warning) 5780 ): 5781 log.info(f" {message}") 5782 # debug info 5783 for message in list(set(error_message_command_all)): 5784 log.debug(f" {message}") 5785 # failed 5786 if len(error_message_command_err): 5787 log.error("Annotation failed: Error in commands") 5788 raise ValueError("Annotation failed: Error in commands") 5789 5790 if tmp_annotates_vcf_name_list: 5791 5792 # List of annotated files 5793 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5794 5795 # Tmp file 5796 tmp_annotate_vcf = NamedTemporaryFile( 5797 prefix=self.get_prefix(), 5798 dir=self.get_tmp_dir(), 5799 suffix=".vcf.gz", 5800 delete=False, 5801 ) 5802 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5803 tmp_files.append(tmp_annotate_vcf_name) 5804 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5805 err_files.append(tmp_annotate_vcf_name_err) 5806 tmp_files.append(tmp_annotate_vcf_name_err) 5807 5808 # Command merge 5809 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5810 log.info( 5811 f"Annotation Annovar - Annotation merging " 5812 + str(len(tmp_annotates_vcf_name_list)) 5813 + " annotated files" 5814 ) 5815 log.debug(f"Annotation - merge command: {merge_command}") 5816 run_parallel_commands([merge_command], 1) 5817 5818 # Find annotation in header 5819 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5820 header_list = self.read_vcf_header(f) 5821 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5822 5823 for ann in annovar_vcf_header.infos: 5824 if ann not in self.get_header().infos: 5825 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5826 5827 # Update variants 5828 log.info(f"Annotation Annovar - 
Updating...") 5829 self.update_from_vcf(tmp_annotate_vcf_name) 5830 5831 # Clean files 5832 # Tmp file remove command 5833 if True: 5834 tmp_files_remove_command = "" 5835 if tmp_files: 5836 tmp_files_remove_command = " ".join(tmp_files) 5837 clean_command = f" rm -f {tmp_files_remove_command} " 5838 log.debug(f"Annotation Annovar - Annotation cleaning ") 5839 log.debug(f"Annotation - cleaning command: {clean_command}") 5840 run_parallel_commands([clean_command], 1) 5841 5842 # Parquet 5843 def annotation_parquet(self, threads: int = None) -> None: 5844 """ 5845 It takes a VCF file, and annotates it with a parquet file 5846 5847 :param threads: number of threads to use for the annotation 5848 :return: the value of the variable "result". 5849 """ 5850 5851 # DEBUG 5852 log.debug("Start annotation with parquet databases") 5853 5854 # Threads 5855 if not threads: 5856 threads = self.get_threads() 5857 log.debug("Threads: " + str(threads)) 5858 5859 # DEBUG 5860 delete_tmp = True 5861 if self.get_config().get("verbosity", "warning") in ["debug"]: 5862 delete_tmp = False 5863 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5864 5865 # Config 5866 databases_folders = set( 5867 self.get_config() 5868 .get("folders", {}) 5869 .get("databases", {}) 5870 .get("annotations", ["."]) 5871 + self.get_config() 5872 .get("folders", {}) 5873 .get("databases", {}) 5874 .get("parquet", ["."]) 5875 ) 5876 log.debug("Databases annotations: " + str(databases_folders)) 5877 5878 # Param 5879 annotations = ( 5880 self.get_param() 5881 .get("annotation", {}) 5882 .get("parquet", {}) 5883 .get("annotations", None) 5884 ) 5885 log.debug("Annotations: " + str(annotations)) 5886 5887 # Assembly 5888 assembly = self.get_param().get( 5889 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5890 ) 5891 5892 # Force Update Annotation 5893 force_update_annotation = ( 5894 self.get_param() 5895 .get("annotation", {}) 5896 .get("options", {}) 5897 .get("annotations_update", 
False) 5898 ) 5899 log.debug(f"force_update_annotation={force_update_annotation}") 5900 force_append_annotation = ( 5901 self.get_param() 5902 .get("annotation", {}) 5903 .get("options", {}) 5904 .get("annotations_append", False) 5905 ) 5906 log.debug(f"force_append_annotation={force_append_annotation}") 5907 5908 # Data 5909 table_variants = self.get_table_variants() 5910 5911 # Check if not empty 5912 log.debug("Check if not empty") 5913 sql_query_chromosomes_df = self.get_query_to_df( 5914 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5915 ) 5916 if not sql_query_chromosomes_df["count"][0]: 5917 log.info(f"VCF empty") 5918 return 5919 5920 # VCF header 5921 vcf_reader = self.get_header() 5922 log.debug("Initial header: " + str(vcf_reader.infos)) 5923 5924 # Nb Variants POS 5925 log.debug("NB Variants Start") 5926 nb_variants = self.conn.execute( 5927 f"SELECT count(*) AS count FROM variants" 5928 ).fetchdf()["count"][0] 5929 log.debug("NB Variants Stop") 5930 5931 # Existing annotations 5932 for vcf_annotation in self.get_header().infos: 5933 5934 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5935 log.debug( 5936 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5937 ) 5938 5939 # Added columns 5940 added_columns = [] 5941 5942 # drop indexes 5943 log.debug(f"Drop indexes...") 5944 self.drop_indexes() 5945 5946 if annotations: 5947 5948 if "ALL" in annotations: 5949 5950 all_param = annotations.get("ALL", {}) 5951 all_param_formats = all_param.get("formats", None) 5952 all_param_releases = all_param.get("releases", None) 5953 5954 databases_infos_dict = self.scan_databases( 5955 database_formats=all_param_formats, 5956 database_releases=all_param_releases, 5957 ) 5958 for database_infos in databases_infos_dict.keys(): 5959 if database_infos not in annotations: 5960 annotations[database_infos] = {"INFO": None} 5961 5962 for annotation in annotations: 5963 5964 if annotation in ["ALL"]: 
5965 continue 5966 5967 # Annotation Name 5968 annotation_name = os.path.basename(annotation) 5969 5970 # Annotation fields 5971 annotation_fields = annotations[annotation] 5972 if not annotation_fields: 5973 annotation_fields = {"INFO": None} 5974 5975 log.debug(f"Annotation '{annotation_name}'") 5976 log.debug( 5977 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5978 ) 5979 5980 # Create Database 5981 database = Database( 5982 database=annotation, 5983 databases_folders=databases_folders, 5984 assembly=assembly, 5985 ) 5986 5987 # Find files 5988 parquet_file = database.get_database() 5989 parquet_hdr_file = database.get_header_file() 5990 parquet_type = database.get_type() 5991 5992 # Check if files exists 5993 if not parquet_file or not parquet_hdr_file: 5994 msg_err_list = [] 5995 if not parquet_file: 5996 msg_err_list.append( 5997 f"Annotation failed: Annotation file not found" 5998 ) 5999 if parquet_file and not parquet_hdr_file: 6000 msg_err_list.append( 6001 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6002 ) 6003 6004 log.error(". ".join(msg_err_list)) 6005 raise ValueError(". 
".join(msg_err_list)) 6006 else: 6007 # Get parquet connexion 6008 parquet_sql_attach = database.get_sql_database_attach( 6009 output="query" 6010 ) 6011 if parquet_sql_attach: 6012 self.conn.execute(parquet_sql_attach) 6013 parquet_file_link = database.get_sql_database_link() 6014 # Log 6015 log.debug( 6016 f"Annotation '{annotation_name}' - file: " 6017 + str(parquet_file) 6018 + " and " 6019 + str(parquet_hdr_file) 6020 ) 6021 6022 # Database full header columns 6023 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6024 parquet_hdr_file 6025 ) 6026 # Log 6027 log.debug( 6028 "Annotation database header columns : " 6029 + str(parquet_hdr_vcf_header_columns) 6030 ) 6031 6032 # Load header as VCF object 6033 parquet_hdr_vcf_header_infos = database.get_header().infos 6034 # Log 6035 log.debug( 6036 "Annotation database header: " 6037 + str(parquet_hdr_vcf_header_infos) 6038 ) 6039 6040 # Get extra infos 6041 parquet_columns = database.get_extra_columns() 6042 # Log 6043 log.debug("Annotation database Columns: " + str(parquet_columns)) 6044 6045 # Add extra columns if "ALL" in annotation_fields 6046 # if "ALL" in annotation_fields: 6047 # allow_add_extra_column = True 6048 if "ALL" in annotation_fields and database.get_extra_columns(): 6049 for extra_column in database.get_extra_columns(): 6050 if ( 6051 extra_column not in annotation_fields 6052 and extra_column.replace("INFO/", "") 6053 not in parquet_hdr_vcf_header_infos 6054 ): 6055 parquet_hdr_vcf_header_infos[extra_column] = ( 6056 vcf.parser._Info( 6057 extra_column, 6058 ".", 6059 "String", 6060 f"{extra_column} description", 6061 "unknown", 6062 "unknown", 6063 self.code_type_map["String"], 6064 ) 6065 ) 6066 6067 # For all fields in database 6068 annotation_fields_all = False 6069 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6070 annotation_fields_all = True 6071 annotation_fields = { 6072 key: key for key in parquet_hdr_vcf_header_infos 6073 } 6074 6075 log.debug( 6076 
"Annotation database header - All annotations added: " 6077 + str(annotation_fields) 6078 ) 6079 6080 # Init 6081 6082 # List of annotation fields to use 6083 sql_query_annotation_update_info_sets = [] 6084 6085 # List of annotation to agregate 6086 sql_query_annotation_to_agregate = [] 6087 6088 # Number of fields 6089 nb_annotation_field = 0 6090 6091 # Annotation fields processed 6092 annotation_fields_processed = [] 6093 6094 # Columns mapping 6095 map_columns = database.map_columns( 6096 columns=annotation_fields, prefixes=["INFO/"] 6097 ) 6098 6099 # Query dict for fields to remove (update option) 6100 query_dict_remove = {} 6101 6102 # Fetch Anotation fields 6103 for annotation_field in annotation_fields: 6104 6105 # annotation_field_column 6106 annotation_field_column = map_columns.get( 6107 annotation_field, "INFO" 6108 ) 6109 6110 # field new name, if parametered 6111 annotation_fields_new_name = annotation_fields.get( 6112 annotation_field, annotation_field 6113 ) 6114 if not annotation_fields_new_name: 6115 annotation_fields_new_name = annotation_field 6116 6117 # To annotate 6118 # force_update_annotation = True 6119 # force_append_annotation = True 6120 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6121 if annotation_field in parquet_hdr_vcf_header_infos and ( 6122 force_update_annotation 6123 or force_append_annotation 6124 or ( 6125 annotation_fields_new_name 6126 not in self.get_header().infos 6127 ) 6128 ): 6129 6130 # Add field to annotation to process list 6131 annotation_fields_processed.append( 6132 annotation_fields_new_name 6133 ) 6134 6135 # explode infos for the field 6136 annotation_fields_new_name_info_msg = "" 6137 if ( 6138 force_update_annotation 6139 and annotation_fields_new_name 6140 in self.get_header().infos 6141 ): 6142 # Remove field from INFO 6143 query = f""" 6144 UPDATE {table_variants} as table_variants 6145 SET INFO = 
REGEXP_REPLACE( 6146 concat(table_variants.INFO,''), 6147 ';*{annotation_fields_new_name}=[^;]*', 6148 '' 6149 ) 6150 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6151 """ 6152 annotation_fields_new_name_info_msg = " [update]" 6153 query_dict_remove[ 6154 f"remove 'INFO/{annotation_fields_new_name}'" 6155 ] = query 6156 6157 # Sep between fields in INFO 6158 nb_annotation_field += 1 6159 if nb_annotation_field > 1: 6160 annotation_field_sep = ";" 6161 else: 6162 annotation_field_sep = "" 6163 6164 log.info( 6165 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6166 ) 6167 6168 # Add INFO field to header 6169 parquet_hdr_vcf_header_infos_number = ( 6170 parquet_hdr_vcf_header_infos[annotation_field].num 6171 or "." 6172 ) 6173 parquet_hdr_vcf_header_infos_type = ( 6174 parquet_hdr_vcf_header_infos[annotation_field].type 6175 or "String" 6176 ) 6177 parquet_hdr_vcf_header_infos_description = ( 6178 parquet_hdr_vcf_header_infos[annotation_field].desc 6179 or f"{annotation_field} description" 6180 ) 6181 parquet_hdr_vcf_header_infos_source = ( 6182 parquet_hdr_vcf_header_infos[annotation_field].source 6183 or "unknown" 6184 ) 6185 parquet_hdr_vcf_header_infos_version = ( 6186 parquet_hdr_vcf_header_infos[annotation_field].version 6187 or "unknown" 6188 ) 6189 6190 vcf_reader.infos[annotation_fields_new_name] = ( 6191 vcf.parser._Info( 6192 annotation_fields_new_name, 6193 parquet_hdr_vcf_header_infos_number, 6194 parquet_hdr_vcf_header_infos_type, 6195 parquet_hdr_vcf_header_infos_description, 6196 parquet_hdr_vcf_header_infos_source, 6197 parquet_hdr_vcf_header_infos_version, 6198 self.code_type_map[ 6199 parquet_hdr_vcf_header_infos_type 6200 ], 6201 ) 6202 ) 6203 6204 # Append 6205 if force_append_annotation: 6206 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6207 else: 6208 query_case_when_append = "" 6209 6210 # Annotation/Update query fields 6211 # Found in INFO column 6212 if ( 6213 annotation_field_column == "INFO" 6214 and "INFO" in parquet_hdr_vcf_header_columns 6215 ): 6216 sql_query_annotation_update_info_sets.append( 6217 f""" 6218 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6219 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6220 ELSE '' 6221 END 6222 """ 6223 ) 6224 # Found in a specific column 6225 else: 6226 sql_query_annotation_update_info_sets.append( 6227 f""" 6228 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6229 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6230 ELSE '' 6231 END 6232 """ 6233 ) 6234 sql_query_annotation_to_agregate.append( 6235 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6236 ) 6237 6238 # Not to annotate 6239 else: 6240 6241 if force_update_annotation: 6242 annotation_message = "forced" 6243 else: 6244 annotation_message = "skipped" 6245 6246 if annotation_field not in parquet_hdr_vcf_header_infos: 6247 log.warning( 6248 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6249 ) 6250 if annotation_fields_new_name in self.get_header().infos: 6251 log.warning( 6252 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6253 ) 6254 6255 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6256 # allow_annotation_full_info = True 6257 allow_annotation_full_info = not force_append_annotation 6258 6259 if parquet_type in ["regions"]: 6260 allow_annotation_full_info = False 6261 6262 if ( 6263 allow_annotation_full_info 6264 and nb_annotation_field == len(annotation_fields) 6265 and annotation_fields_all 6266 and ( 6267 "INFO" in parquet_hdr_vcf_header_columns 6268 and "INFO" in database.get_extra_columns() 6269 ) 6270 ): 6271 log.debug("Column INFO annotation enabled") 6272 sql_query_annotation_update_info_sets = [] 6273 sql_query_annotation_update_info_sets.append( 6274 f" table_parquet.INFO " 6275 ) 6276 6277 if sql_query_annotation_update_info_sets: 6278 6279 # Annotate 6280 log.info(f"Annotation '{annotation_name}' - Annotation...") 6281 6282 # Join query annotation update info sets for SQL 6283 sql_query_annotation_update_info_sets_sql = ",".join( 6284 sql_query_annotation_update_info_sets 6285 ) 6286 6287 # Check chromosomes list (and variants infos) 6288 sql_query_chromosomes = f""" 6289 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6290 FROM {table_variants} as table_variants 6291 GROUP BY table_variants."#CHROM" 6292 ORDER BY table_variants."#CHROM" 6293 """ 6294 sql_query_chromosomes_df = self.conn.execute( 6295 sql_query_chromosomes 6296 ).df() 6297 sql_query_chromosomes_dict = { 6298 entry["CHROM"]: { 6299 "count": entry["count_variants"], 6300 "min": entry["min_variants"], 6301 "max": entry["max_variants"], 6302 } 6303 for index, entry in sql_query_chromosomes_df.iterrows() 6304 } 6305 6306 # Init 6307 nb_of_query = 0 6308 nb_of_variant_annotated = 0 6309 query_dict = query_dict_remove 6310 6311 # for chrom in sql_query_chromosomes_df["CHROM"]: 6312 for chrom in sql_query_chromosomes_dict: 6313 6314 # Number of variant by chromosome 6315 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6316 chrom, {} 6317 ).get("count", 0) 6318 6319 
log.debug( 6320 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6321 ) 6322 6323 # Annotation with regions database 6324 if parquet_type in ["regions"]: 6325 sql_query_annotation_from_clause = f""" 6326 FROM ( 6327 SELECT 6328 '{chrom}' AS \"#CHROM\", 6329 table_variants_from.\"POS\" AS \"POS\", 6330 {",".join(sql_query_annotation_to_agregate)} 6331 FROM {table_variants} as table_variants_from 6332 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6333 table_parquet_from."#CHROM" = '{chrom}' 6334 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6335 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6336 ) 6337 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6338 GROUP BY table_variants_from.\"POS\" 6339 ) 6340 as table_parquet 6341 """ 6342 6343 sql_query_annotation_where_clause = """ 6344 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6345 AND table_parquet.\"POS\" = table_variants.\"POS\" 6346 """ 6347 6348 # Annotation with variants database 6349 else: 6350 sql_query_annotation_from_clause = f""" 6351 FROM {parquet_file_link} as table_parquet 6352 """ 6353 sql_query_annotation_where_clause = f""" 6354 table_variants."#CHROM" = '{chrom}' 6355 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6356 AND table_parquet.\"POS\" = table_variants.\"POS\" 6357 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6358 AND table_parquet.\"REF\" = table_variants.\"REF\" 6359 """ 6360 6361 # Create update query 6362 sql_query_annotation_chrom_interval_pos = f""" 6363 UPDATE {table_variants} as table_variants 6364 SET INFO = 6365 concat( 6366 CASE WHEN table_variants.INFO NOT IN ('','.') 6367 THEN table_variants.INFO 6368 ELSE '' 6369 END 6370 , 6371 CASE WHEN table_variants.INFO NOT IN ('','.') 6372 AND ( 6373 concat({sql_query_annotation_update_info_sets_sql}) 6374 ) 6375 NOT IN ('','.') 6376 THEN ';' 6377 ELSE '' 6378 END 6379 , 6380 
{sql_query_annotation_update_info_sets_sql} 6381 ) 6382 {sql_query_annotation_from_clause} 6383 WHERE {sql_query_annotation_where_clause} 6384 ; 6385 """ 6386 6387 # Add update query to dict 6388 query_dict[ 6389 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6390 ] = sql_query_annotation_chrom_interval_pos 6391 6392 nb_of_query = len(query_dict) 6393 num_query = 0 6394 6395 # SET max_expression_depth TO x 6396 self.conn.execute("SET max_expression_depth TO 10000") 6397 6398 for query_name in query_dict: 6399 query = query_dict[query_name] 6400 num_query += 1 6401 log.info( 6402 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6403 ) 6404 result = self.conn.execute(query) 6405 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6406 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6407 log.info( 6408 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6409 ) 6410 6411 log.info( 6412 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6413 ) 6414 6415 else: 6416 6417 log.info( 6418 f"Annotation '{annotation_name}' - No Annotations available" 6419 ) 6420 6421 log.debug("Final header: " + str(vcf_reader.infos)) 6422 6423 # Remove added columns 6424 for added_column in added_columns: 6425 self.drop_column(column=added_column) 6426 6427 def annotation_splice(self, threads: int = None) -> None: 6428 """ 6429 This function annotate with snpEff 6430 6431 :param threads: The number of threads to use 6432 :return: the value of the variable "return_value". 
6433 """ 6434 6435 # DEBUG 6436 log.debug("Start annotation with splice tools") 6437 6438 # Threads 6439 if not threads: 6440 threads = self.get_threads() 6441 log.debug("Threads: " + str(threads)) 6442 6443 # DEBUG 6444 delete_tmp = True 6445 if self.get_config().get("verbosity", "warning") in ["debug"]: 6446 delete_tmp = False 6447 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6448 6449 # Config 6450 config = self.get_config() 6451 log.debug("Config: " + str(config)) 6452 splice_config = config.get("tools", {}).get("splice", {}) 6453 if not splice_config: 6454 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6455 msg_err = "No Splice tool config" 6456 raise ValueError(msg_err) 6457 log.debug(f"splice_config: {splice_config}") 6458 6459 # Config - Folders - Databases 6460 databases_folders = ( 6461 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6462 ) 6463 log.debug("Databases annotations: " + str(databases_folders)) 6464 6465 # Splice docker image 6466 splice_docker_image = splice_config.get("docker").get("image") 6467 6468 # Pull splice image if it's not already there 6469 if not check_docker_image_exists(splice_docker_image): 6470 log.warning( 6471 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6472 ) 6473 try: 6474 command(f"docker pull {splice_config.get('docker').get('image')}") 6475 except subprocess.CalledProcessError: 6476 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6477 log.error(msg_err) 6478 raise ValueError(msg_err) 6479 6480 # Config - splice databases 6481 splice_databases = ( 6482 config.get("folders", {}) 6483 .get("databases", {}) 6484 .get("splice", DEFAULT_SPLICE_FOLDER) 6485 ) 6486 splice_databases = full_path(splice_databases) 6487 6488 # Param 6489 param = self.get_param() 6490 log.debug("Param: " + str(param)) 6491 6492 # Param 6493 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6494 
log.debug("Options: " + str(options)) 6495 6496 # Data 6497 table_variants = self.get_table_variants() 6498 6499 # Check if not empty 6500 log.debug("Check if not empty") 6501 sql_query_chromosomes = ( 6502 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6503 ) 6504 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6505 log.info("VCF empty") 6506 return None 6507 6508 # Export in VCF 6509 log.debug("Create initial file to annotate") 6510 6511 # Create output folder / work folder 6512 if options.get("output_folder", ""): 6513 output_folder = options.get("output_folder", "") 6514 if not os.path.exists(output_folder): 6515 Path(output_folder).mkdir(parents=True, exist_ok=True) 6516 else: 6517 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6518 if not os.path.exists(output_folder): 6519 Path(output_folder).mkdir(parents=True, exist_ok=True) 6520 6521 if options.get("workdir", ""): 6522 workdir = options.get("workdir", "") 6523 else: 6524 workdir = "/work" 6525 6526 # Create tmp VCF file 6527 tmp_vcf = NamedTemporaryFile( 6528 prefix=self.get_prefix(), 6529 dir=output_folder, 6530 suffix=".vcf", 6531 delete=False, 6532 ) 6533 tmp_vcf_name = tmp_vcf.name 6534 6535 # VCF header 6536 header = self.get_header() 6537 6538 # Existing annotations 6539 for vcf_annotation in self.get_header().infos: 6540 6541 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6542 log.debug( 6543 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6544 ) 6545 6546 # Memory limit 6547 if config.get("memory", None): 6548 memory_limit = config.get("memory", "8G").upper() 6549 # upper() 6550 else: 6551 memory_limit = "8G" 6552 log.debug(f"memory_limit: {memory_limit}") 6553 6554 # Check number of variants to annotate 6555 where_clause_regex_spliceai = r"SpliceAI_\w+" 6556 where_clause_regex_spip = r"SPiP_\w+" 6557 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6558 df_list_of_variants_to_annotate = self.get_query_to_df( 6559 query=f""" SELECT * FROM variants {where_clause} """ 6560 ) 6561 if len(df_list_of_variants_to_annotate) == 0: 6562 log.warning( 6563 f"No variants to annotate with splice. Variants probably already annotated with splice" 6564 ) 6565 return None 6566 else: 6567 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6568 6569 # Export VCF file 6570 self.export_variant_vcf( 6571 vcf_file=tmp_vcf_name, 6572 remove_info=True, 6573 add_samples=True, 6574 index=False, 6575 where_clause=where_clause, 6576 ) 6577 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6578 if any(value for value in splice_config.values() if value is None): 6579 log.warning("At least one splice config parameter is empty") 6580 # exit annotation_splice 6581 return None 6582 6583 # Params in splice nf 6584 def check_values(dico: dict): 6585 """ 6586 Ensure parameters for NF splice pipeline 6587 """ 6588 for key, val in dico.items(): 6589 if key == "genome": 6590 if any( 6591 assemb in options.get("genome", {}) 6592 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6593 ): 6594 yield f"--{key} hg19" 6595 elif any( 6596 assemb in options.get("genome", {}) 6597 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6598 ): 6599 yield f"--{key} hg38" 6600 elif ( 6601 (isinstance(val, str) and val) 6602 or isinstance(val, int) 6603 or isinstance(val, bool) 6604 ): 6605 yield f"--{key} {val}" 6606 6607 # Genome 6608 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6609 options["genome"] = genome 6610 # NF params 6611 nf_params = [] 6612 # Add options 6613 if options: 6614 log.debug(options) 6615 nf_params = list(check_values(options)) 6616 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6617 else: 6618 log.debug("No NF params provided") 6619 # Add threads 6620 if "threads" not in 
options.keys(): 6621 nf_params.append(f"--threads {threads}") 6622 # Genome path 6623 genome_path = find_genome( 6624 config.get("folders", {}) 6625 .get("databases", {}) 6626 .get("genomes", DEFAULT_GENOME_FOLDER), 6627 file=f"{genome}.fa", 6628 ) 6629 # Add genome path 6630 if not genome_path: 6631 raise ValueError( 6632 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6633 ) 6634 else: 6635 log.debug(f"Genome: {genome_path}") 6636 nf_params.append(f"--genome_path {genome_path}") 6637 6638 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6639 """ 6640 Setting up updated databases for SPiP and SpliceAI 6641 """ 6642 6643 try: 6644 6645 # SpliceAI assembly transcriptome 6646 spliceai_assembly = os.path.join( 6647 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6648 options.get("genome"), 6649 "transcriptome", 6650 ) 6651 spip_assembly = options.get("genome") 6652 6653 spip = find( 6654 f"transcriptome_{spip_assembly}.RData", 6655 config.get("folders", {}).get("databases", {}).get("spip", {}), 6656 ) 6657 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6658 log.debug(f"SPiP annotations: {spip}") 6659 log.debug(f"SpliceAI annotations: {spliceai}") 6660 if spip and spliceai: 6661 return [ 6662 f"--spip_transcriptome {spip}", 6663 f"--spliceai_transcriptome {spliceai}", 6664 ] 6665 else: 6666 log.warning( 6667 "Can't find splice databases in configuration, use annotations file from image" 6668 ) 6669 except TypeError: 6670 log.warning( 6671 "Can't find splice databases in configuration, use annotations file from image" 6672 ) 6673 return [] 6674 6675 # Add options, check if transcriptome option have already beend provided 6676 if ( 6677 "spip_transcriptome" not in nf_params 6678 and "spliceai_transcriptome" not in nf_params 6679 ): 6680 splice_reference = splice_annotations(options, config) 6681 if splice_reference: 6682 
nf_params.extend(splice_reference) 6683 # nf_params.append(f"--output_folder {output_folder}") 6684 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6685 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6686 log.debug(cmd) 6687 splice_config["docker"]["command"] = cmd 6688 6689 # Ensure proxy is set 6690 proxy = [ 6691 f"-e {var}={os.getenv(var)}" 6692 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6693 if os.getenv(var) is not None 6694 ] 6695 docker_cmd = get_bin_command( 6696 tool="splice", 6697 bin_type="docker", 6698 config=config, 6699 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6700 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6701 ) 6702 # print(docker_cmd) 6703 # exit() 6704 # Docker debug 6705 # if splice_config.get("rm_container"): 6706 # rm_container = "--rm" 6707 # else: 6708 # rm_container = "" 6709 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6710 log.debug(docker_cmd) 6711 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6712 log.debug(res.stdout) 6713 if res.stderr: 6714 log.error(res.stderr) 6715 res.check_returncode() 6716 # Update variants 6717 log.info("Annotation - Updating...") 6718 # Test find output vcf 6719 log.debug( 6720 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6721 ) 6722 output_vcf = [] 6723 # Wrong folder to look in 6724 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6725 if ( 6726 files 6727 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6728 ): 6729 
    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        Two sections are defined:
        - "calculations": operations of "type" "sql" (run through an
          "operation_query") or "python" (dispatched via "function_name" and
          "function_params").
        - "prioritizations": default prioritization profile with scoring
          criteria per annotation field.

        :param name: The `get_config_default` function returns a dictionary containing default
        configurations for different calculations and prioritizations. The `name` parameter is used to
        specify which specific configuration to retrieve from the dictionary
        :type name: str
        :return: The function `get_config_default` returns a dictionary containing default configuration
        settings for different calculations and prioritizations. The specific configuration settings are
        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
        matches a key in the `config_default` dictionary, the corresponding configuration settings are
        returned. If there is no match, None is returned.
        """

        config_default = {
            # Available calculation operations
            "calculations": {
                # SQL-based: build a variant identifier column
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                # SQL-based: classify variant type from REF/ALT lengths
                # NOTE(review): '"SVTYPE" NOT NULL' and '==' are nonstandard
                # SQL accepted by DuckDB — confirm before porting this query
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "table": "variants",
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                        """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                # Python-based calculations: dispatched by function name
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of 'snpeff_ann_explode' and
                # 'snpeff_ann_explode_uniquify' appear swapped relative to
                # their uniquify flag (first function_params value) — confirm
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "RENAME_INFO_FIELDS": {
                    "type": "python",
                    "name": "RENAME_INFO_FIELDS",
                    "description": "Rename or remove INFO/tags",
                    "available": True,
                    "function_name": "calculation_rename_info_fields",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                # Transcripts-related calculations: function_params select the
                # JSON field name and/or the structured field name
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
                "transcripts_export": {
                    "type": "python",
                    "name": "transcripts_export",
                    "description": "Export transcripts table/view as a file (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_export",
                    "function_params": [],
                },
            },
            # Default prioritization profile: scoring criteria applied to the
            # 'ANN2' annotation field (snpEff putative impact levels)
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # Returns None (not {}) when the section name is unknown
        return config_default.get(name, None)
}, 7006 { 7007 "type": "contains", 7008 "value": "MODIFIER", 7009 "score": 0, 7010 "flag": "FILTERED", 7011 "comment": [ 7012 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7013 ], 7014 }, 7015 ], 7016 } 7017 }, 7018 } 7019 7020 return config_default.get(name, None) 7021 7022 def get_config_json( 7023 self, name: str, config_dict: dict = {}, config_file: str = None 7024 ) -> dict: 7025 """ 7026 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7027 default values, a dictionary, and a file. 7028 7029 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7030 the name of the configuration. It is used to identify and retrieve the configuration settings 7031 for a specific component or module 7032 :type name: str 7033 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7034 dictionary that allows you to provide additional configuration settings or overrides. When you 7035 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7036 the key is the configuration setting you want to override or 7037 :type config_dict: dict 7038 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7039 specify the path to a configuration file that contains additional settings. If provided, the 7040 function will read the contents of this file and update the configuration dictionary with the 7041 values found in the file, overriding any existing values with the 7042 :type config_file: str 7043 :return: The function `get_config_json` returns a dictionary containing the configuration 7044 settings. 
7045 """ 7046 7047 # Create with default prioritizations 7048 config_default = self.get_config_default(name=name) 7049 configuration = config_default 7050 # log.debug(f"configuration={configuration}") 7051 7052 # Replace prioritizations from dict 7053 for config in config_dict: 7054 configuration[config] = config_dict[config] 7055 7056 # Replace prioritizations from file 7057 config_file = full_path(config_file) 7058 if config_file: 7059 if os.path.exists(config_file): 7060 with open(config_file) as config_file_content: 7061 config_file_dict = yaml.safe_load(config_file_content) 7062 for config in config_file_dict: 7063 configuration[config] = config_file_dict[config] 7064 else: 7065 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7066 log.error(msg_error) 7067 raise ValueError(msg_error) 7068 7069 return configuration 7070 7071 def prioritization( 7072 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7073 ) -> bool: 7074 """ 7075 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7076 prioritizes variants based on configured profiles and criteria. 7077 7078 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7079 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7080 a table name is provided, the method will prioritize the variants in that specific table 7081 :type table: str 7082 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7083 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7084 provided, the code will use a default prefix value of "PZ" 7085 :type pz_prefix: str 7086 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7087 additional parameters specific to the prioritization process. 
These parameters can include 7088 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7089 configurations needed for the prioritization of variants in a V 7090 :type pz_param: dict 7091 :return: A boolean value (True) is being returned from the `prioritization` function. 7092 """ 7093 7094 # Config 7095 config = self.get_config() 7096 7097 # Param 7098 param = self.get_param() 7099 7100 # Prioritization param 7101 if pz_param is not None: 7102 prioritization_param = pz_param 7103 else: 7104 prioritization_param = param.get("prioritization", {}) 7105 7106 # Configuration profiles 7107 prioritization_config_file = prioritization_param.get( 7108 "prioritization_config", None 7109 ) 7110 prioritization_config_file = full_path(prioritization_config_file) 7111 prioritizations_config = self.get_config_json( 7112 name="prioritizations", config_file=prioritization_config_file 7113 ) 7114 7115 # Prioritization prefix 7116 pz_prefix_default = "PZ" 7117 if pz_prefix is None: 7118 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7119 7120 # Prioritization options 7121 profiles = prioritization_param.get("profiles", []) 7122 if isinstance(profiles, str): 7123 profiles = profiles.split(",") 7124 pzfields = prioritization_param.get( 7125 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7126 ) 7127 if isinstance(pzfields, str): 7128 pzfields = pzfields.split(",") 7129 default_profile = prioritization_param.get("default_profile", None) 7130 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7131 prioritization_score_mode = prioritization_param.get( 7132 "prioritization_score_mode", "HOWARD" 7133 ) 7134 7135 # Quick Prioritizations 7136 prioritizations = param.get("prioritizations", None) 7137 if prioritizations: 7138 log.info("Quick Prioritization:") 7139 for profile in prioritizations.split(","): 7140 if profile not in profiles: 7141 profiles.append(profile) 7142 log.info(f" {profile}") 7143 7144 # If 
profile "ALL" provided, all profiles in the config profiles 7145 if "ALL" in profiles: 7146 profiles = list(prioritizations_config.keys()) 7147 7148 for profile in profiles: 7149 if prioritizations_config.get(profile, None): 7150 log.debug(f"Profile '{profile}' configured") 7151 else: 7152 msg_error = f"Profile '{profile}' NOT configured" 7153 log.error(msg_error) 7154 raise ValueError(msg_error) 7155 7156 if profiles: 7157 log.info(f"Prioritization... ") 7158 else: 7159 log.debug(f"No profile defined") 7160 return False 7161 7162 if not default_profile and len(profiles): 7163 default_profile = profiles[0] 7164 7165 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7166 log.debug("Profiles to check: " + str(list(profiles))) 7167 7168 # Variables 7169 if table is not None: 7170 table_variants = table 7171 else: 7172 table_variants = self.get_table_variants(clause="update") 7173 log.debug(f"Table to prioritize: {table_variants}") 7174 7175 # Added columns 7176 added_columns = [] 7177 7178 # Create list of PZfields 7179 # List of PZFields 7180 list_of_pzfields_original = pzfields + [ 7181 pzfield + pzfields_sep + profile 7182 for pzfield in pzfields 7183 for profile in profiles 7184 ] 7185 list_of_pzfields = [] 7186 log.debug(f"{list_of_pzfields_original}") 7187 7188 # Remove existing PZfields to use if exists 7189 for pzfield in list_of_pzfields_original: 7190 if self.get_header().infos.get(pzfield, None) is None: 7191 list_of_pzfields.append(pzfield) 7192 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7193 else: 7194 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7195 7196 if list_of_pzfields: 7197 7198 # Explode Infos prefix 7199 explode_infos_prefix = self.get_explode_infos_prefix() 7200 7201 # PZfields tags description 7202 PZfields_INFOS = { 7203 f"{pz_prefix}Tags": { 7204 "ID": f"{pz_prefix}Tags", 7205 "Number": ".", 7206 "Type": "String", 7207 "Description": "Variant tags based on annotation 
criteria", 7208 }, 7209 f"{pz_prefix}Score": { 7210 "ID": f"{pz_prefix}Score", 7211 "Number": 1, 7212 "Type": "Integer", 7213 "Description": "Variant score based on annotation criteria", 7214 }, 7215 f"{pz_prefix}Flag": { 7216 "ID": f"{pz_prefix}Flag", 7217 "Number": 1, 7218 "Type": "String", 7219 "Description": "Variant flag based on annotation criteria", 7220 }, 7221 f"{pz_prefix}Comment": { 7222 "ID": f"{pz_prefix}Comment", 7223 "Number": ".", 7224 "Type": "String", 7225 "Description": "Variant comment based on annotation criteria", 7226 }, 7227 f"{pz_prefix}Infos": { 7228 "ID": f"{pz_prefix}Infos", 7229 "Number": ".", 7230 "Type": "String", 7231 "Description": "Variant infos based on annotation criteria", 7232 }, 7233 f"{pz_prefix}Class": { 7234 "ID": f"{pz_prefix}Class", 7235 "Number": ".", 7236 "Type": "String", 7237 "Description": "Variant class based on annotation criteria", 7238 }, 7239 } 7240 7241 # Create INFO fields if not exist 7242 for field in PZfields_INFOS: 7243 field_ID = PZfields_INFOS[field]["ID"] 7244 field_description = PZfields_INFOS[field]["Description"] 7245 if field_ID not in self.get_header().infos and field_ID in pzfields: 7246 field_description = ( 7247 PZfields_INFOS[field]["Description"] 7248 + f", profile {default_profile}" 7249 ) 7250 self.get_header().infos[field_ID] = vcf.parser._Info( 7251 field_ID, 7252 PZfields_INFOS[field]["Number"], 7253 PZfields_INFOS[field]["Type"], 7254 field_description, 7255 "unknown", 7256 "unknown", 7257 code_type_map[PZfields_INFOS[field]["Type"]], 7258 ) 7259 7260 # Create INFO fields if not exist for each profile 7261 for profile in prioritizations_config: 7262 if profile in profiles or profiles == []: 7263 for field in PZfields_INFOS: 7264 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7265 field_description = ( 7266 PZfields_INFOS[field]["Description"] 7267 + f", profile {profile}" 7268 ) 7269 if ( 7270 field_ID not in self.get_header().infos 7271 and field in pzfields 7272 ): 
7273 self.get_header().infos[field_ID] = vcf.parser._Info( 7274 field_ID, 7275 PZfields_INFOS[field]["Number"], 7276 PZfields_INFOS[field]["Type"], 7277 field_description, 7278 "unknown", 7279 "unknown", 7280 code_type_map[PZfields_INFOS[field]["Type"]], 7281 ) 7282 7283 # Header 7284 for pzfield in list_of_pzfields: 7285 if re.match(f"{pz_prefix}Score.*", pzfield): 7286 added_column = self.add_column( 7287 table_name=table_variants, 7288 column_name=pzfield, 7289 column_type="INTEGER", 7290 default_value="0", 7291 ) 7292 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7293 added_column = self.add_column( 7294 table_name=table_variants, 7295 column_name=pzfield, 7296 column_type="BOOLEAN", 7297 default_value="1", 7298 ) 7299 elif re.match(f"{pz_prefix}Class.*", pzfield): 7300 added_column = self.add_column( 7301 table_name=table_variants, 7302 column_name=pzfield, 7303 column_type="VARCHAR[]", 7304 default_value="null", 7305 ) 7306 else: 7307 added_column = self.add_column( 7308 table_name=table_variants, 7309 column_name=pzfield, 7310 column_type="STRING", 7311 default_value="''", 7312 ) 7313 added_columns.append(added_column) 7314 7315 # Profiles 7316 if profiles: 7317 7318 # foreach profile in configuration file 7319 for profile in prioritizations_config: 7320 7321 # If profile is asked in param, or ALL are asked (empty profile []) 7322 if profile in profiles or profiles == []: 7323 log.info(f"Profile '{profile}'") 7324 7325 sql_set_info_option = "" 7326 7327 sql_set_info = [] 7328 7329 # PZ fields set 7330 7331 # PZScore 7332 if ( 7333 f"{pz_prefix}Score{pzfields_sep}{profile}" 7334 in list_of_pzfields 7335 ): 7336 sql_set_info.append( 7337 f""" 7338 concat( 7339 '{pz_prefix}Score{pzfields_sep}{profile}=', 7340 {pz_prefix}Score{pzfields_sep}{profile} 7341 ) 7342 """ 7343 ) 7344 if ( 7345 profile == default_profile 7346 and f"{pz_prefix}Score" in list_of_pzfields 7347 ): 7348 sql_set_info.append( 7349 f""" 7350 concat( 7351 '{pz_prefix}Score=', 7352 
{pz_prefix}Score{pzfields_sep}{profile} 7353 ) 7354 """ 7355 ) 7356 7357 # PZFlag 7358 if ( 7359 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7360 in list_of_pzfields 7361 ): 7362 sql_set_info.append( 7363 f""" 7364 concat( 7365 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7366 CASE 7367 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7368 THEN 'PASS' 7369 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7370 THEN 'FILTERED' 7371 END 7372 ) 7373 """ 7374 ) 7375 if ( 7376 profile == default_profile 7377 and f"{pz_prefix}Flag" in list_of_pzfields 7378 ): 7379 sql_set_info.append( 7380 f""" 7381 concat( 7382 '{pz_prefix}Flag=', 7383 CASE 7384 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7385 THEN 'PASS' 7386 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7387 THEN 'FILTERED' 7388 END 7389 ) 7390 """ 7391 ) 7392 7393 # PZClass 7394 if ( 7395 f"{pz_prefix}Class{pzfields_sep}{profile}" 7396 in list_of_pzfields 7397 ): 7398 sql_set_info.append( 7399 f""" 7400 concat( 7401 '{pz_prefix}Class{pzfields_sep}{profile}=', 7402 CASE 7403 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7404 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7405 ELSE '.' 7406 END 7407 ) 7408 7409 """ 7410 ) 7411 if ( 7412 profile == default_profile 7413 and f"{pz_prefix}Class" in list_of_pzfields 7414 ): 7415 sql_set_info.append( 7416 f""" 7417 concat( 7418 '{pz_prefix}Class=', 7419 CASE 7420 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7421 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7422 ELSE '.' 
7423 END 7424 ) 7425 """ 7426 ) 7427 7428 # PZComment 7429 if ( 7430 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7431 in list_of_pzfields 7432 ): 7433 sql_set_info.append( 7434 f""" 7435 CASE 7436 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7437 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7438 ELSE '' 7439 END 7440 """ 7441 ) 7442 if ( 7443 profile == default_profile 7444 and f"{pz_prefix}Comment" in list_of_pzfields 7445 ): 7446 sql_set_info.append( 7447 f""" 7448 CASE 7449 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7450 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7451 ELSE '' 7452 END 7453 """ 7454 ) 7455 7456 # PZInfos 7457 if ( 7458 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7459 in list_of_pzfields 7460 ): 7461 sql_set_info.append( 7462 f""" 7463 CASE 7464 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7465 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7466 ELSE '' 7467 END 7468 """ 7469 ) 7470 if ( 7471 profile == default_profile 7472 and f"{pz_prefix}Infos" in list_of_pzfields 7473 ): 7474 sql_set_info.append( 7475 f""" 7476 CASE 7477 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7478 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7479 ELSE '' 7480 END 7481 """ 7482 ) 7483 7484 # Merge PZfields 7485 sql_set_info_option = "" 7486 sql_set_sep = "" 7487 for sql_set in sql_set_info: 7488 if sql_set_sep: 7489 sql_set_info_option += f""" 7490 , concat('{sql_set_sep}', {sql_set}) 7491 """ 7492 else: 7493 sql_set_info_option += f""" 7494 , {sql_set} 7495 """ 7496 sql_set_sep = ";" 7497 7498 sql_queries = [] 7499 for annotation in prioritizations_config[profile]: 7500 7501 # skip special sections 7502 if annotation.startswith("_"): 7503 continue 7504 7505 # For each criterions 7506 for criterion in prioritizations_config[profile][ 7507 annotation 
7508 ]: 7509 7510 # Criterion mode 7511 criterion_mode = None 7512 if np.any( 7513 np.isin(list(criterion.keys()), ["type", "value"]) 7514 ): 7515 criterion_mode = "operation" 7516 elif np.any( 7517 np.isin(list(criterion.keys()), ["sql", "fields"]) 7518 ): 7519 criterion_mode = "sql" 7520 log.debug(f"Criterion Mode: {criterion_mode}") 7521 7522 # Criterion parameters 7523 criterion_type = criterion.get("type", None) 7524 criterion_value = criterion.get("value", None) 7525 criterion_sql = criterion.get("sql", None) 7526 criterion_fields = criterion.get("fields", None) 7527 criterion_score = criterion.get("score", 0) 7528 criterion_flag = criterion.get("flag", "PASS") 7529 criterion_class = criterion.get("class", None) 7530 criterion_flag_bool = criterion_flag == "PASS" 7531 criterion_comment = ( 7532 ", ".join(criterion.get("comment", [])) 7533 .replace("'", "''") 7534 .replace(";", ",") 7535 .replace("\t", " ") 7536 ) 7537 criterion_infos = ( 7538 str(criterion) 7539 .replace("'", "''") 7540 .replace(";", ",") 7541 .replace("\t", " ") 7542 ) 7543 7544 # SQL 7545 if criterion_sql is not None and isinstance( 7546 criterion_sql, list 7547 ): 7548 criterion_sql = " ".join(criterion_sql) 7549 7550 # Fields and explode 7551 if criterion_fields is None: 7552 criterion_fields = [annotation] 7553 if not isinstance(criterion_fields, list): 7554 criterion_fields = str(criterion_fields).split(",") 7555 7556 # Class 7557 if criterion_class is not None and not isinstance( 7558 criterion_class, list 7559 ): 7560 criterion_class = str(criterion_class).split(",") 7561 7562 for annotation_field in criterion_fields: 7563 7564 # Explode specific annotation 7565 log.debug( 7566 f"Explode annotation '{annotation_field}'" 7567 ) 7568 added_columns += self.explode_infos( 7569 prefix=explode_infos_prefix, 7570 fields=[annotation_field], 7571 table=table_variants, 7572 ) 7573 extra_infos = self.get_extra_infos( 7574 table=table_variants 7575 ) 7576 7577 # Check if annotation field is 
present 7578 if ( 7579 f"{explode_infos_prefix}{annotation_field}" 7580 not in extra_infos 7581 ): 7582 msq_err = f"Annotation '{annotation_field}' not in data" 7583 log.error(msq_err) 7584 raise ValueError(msq_err) 7585 else: 7586 log.debug( 7587 f"Annotation '{annotation_field}' in data" 7588 ) 7589 7590 sql_set = [] 7591 sql_set_info = [] 7592 7593 # PZ fields set 7594 7595 # PZScore 7596 if ( 7597 f"{pz_prefix}Score{pzfields_sep}{profile}" 7598 in list_of_pzfields 7599 ): 7600 # VaRank prioritization score mode 7601 if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]: 7602 sql_set.append( 7603 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7604 ) 7605 # default HOWARD prioritization score mode 7606 else: 7607 sql_set.append( 7608 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7609 ) 7610 7611 # PZFlag 7612 if ( 7613 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7614 in list_of_pzfields 7615 ): 7616 sql_set.append( 7617 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7618 ) 7619 7620 # PZClass 7621 if ( 7622 f"{pz_prefix}Class{pzfields_sep}{profile}" 7623 in list_of_pzfields 7624 and criterion_class is not None 7625 ): 7626 sql_set.append( 7627 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7628 ) 7629 7630 # PZComment 7631 if ( 7632 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7633 in list_of_pzfields 7634 ): 7635 sql_set.append( 7636 f""" 7637 {pz_prefix}Comment{pzfields_sep}{profile} = 7638 concat( 7639 {pz_prefix}Comment{pzfields_sep}{profile}, 7640 CASE 7641 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7642 THEN ', ' 7643 ELSE '' 7644 END, 7645 '{criterion_comment}' 7646 ) 7647 """ 
7648 ) 7649 7650 # PZInfos 7651 if ( 7652 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7653 in list_of_pzfields 7654 ): 7655 sql_set.append( 7656 f""" 7657 {pz_prefix}Infos{pzfields_sep}{profile} = 7658 concat( 7659 {pz_prefix}Infos{pzfields_sep}{profile}, 7660 '{criterion_infos}' 7661 ) 7662 """ 7663 ) 7664 sql_set_option = ",".join(sql_set) 7665 7666 # Criterion and comparison 7667 if sql_set_option: 7668 7669 if criterion_mode in ["operation"]: 7670 7671 try: 7672 float(criterion_value) 7673 sql_update = f""" 7674 UPDATE {table_variants} 7675 SET {sql_set_option} 7676 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7677 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7678 """ 7679 except: 7680 contains_option = "" 7681 if criterion_type == "contains": 7682 contains_option = ".*" 7683 sql_update = f""" 7684 UPDATE {table_variants} 7685 SET {sql_set_option} 7686 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7687 """ 7688 sql_queries.append(sql_update) 7689 7690 elif criterion_mode in ["sql"]: 7691 7692 sql_update = f""" 7693 UPDATE {table_variants} 7694 SET {sql_set_option} 7695 WHERE {criterion_sql} 7696 """ 7697 sql_queries.append(sql_update) 7698 7699 else: 7700 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7701 log.error(msg_err) 7702 raise ValueError(msg_err) 7703 7704 else: 7705 log.warning( 7706 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7707 ) 7708 7709 # PZTags 7710 if ( 7711 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7712 in list_of_pzfields 7713 ): 7714 7715 # Create PZFalgs value 7716 pztags_value = "" 7717 pztags_sep_default = "," 7718 pztags_sep = "" 7719 for pzfield in pzfields: 7720 if pzfield not in [f"{pz_prefix}Tags"]: 7721 if ( 7722 f"{pzfield}{pzfields_sep}{profile}" 7723 in list_of_pzfields 7724 ): 7725 if pzfield in [f"{pz_prefix}Flag"]: 7726 
pztags_value += f"""{pztags_sep}{pzfield}#', 7727 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7728 THEN 'PASS' 7729 ELSE 'FILTERED' 7730 END, '""" 7731 elif pzfield in [f"{pz_prefix}Class"]: 7732 pztags_value += f"""{pztags_sep}{pzfield}#', 7733 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7734 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7735 ELSE '.' 7736 END, '""" 7737 else: 7738 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7739 pztags_sep = pztags_sep_default 7740 7741 # Add Query update for PZFlags 7742 sql_update_pztags = f""" 7743 UPDATE {table_variants} 7744 SET INFO = concat( 7745 INFO, 7746 CASE WHEN INFO NOT in ('','.') 7747 THEN ';' 7748 ELSE '' 7749 END, 7750 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7751 ) 7752 """ 7753 sql_queries.append(sql_update_pztags) 7754 7755 # Add Query update for PZFlags for default 7756 if profile == default_profile: 7757 sql_update_pztags_default = f""" 7758 UPDATE {table_variants} 7759 SET INFO = concat( 7760 INFO, 7761 ';', 7762 '{pz_prefix}Tags={pztags_value}' 7763 ) 7764 """ 7765 sql_queries.append(sql_update_pztags_default) 7766 7767 log.info(f"""Profile '{profile}' - Prioritization... """) 7768 7769 if sql_queries: 7770 7771 for sql_query in sql_queries: 7772 log.debug( 7773 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7774 ) 7775 self.conn.execute(sql_query) 7776 7777 log.info(f"""Profile '{profile}' - Update... 
""") 7778 sql_query_update = f""" 7779 UPDATE {table_variants} 7780 SET INFO = 7781 concat( 7782 CASE 7783 WHEN INFO NOT IN ('','.') 7784 THEN concat(INFO, ';') 7785 ELSE '' 7786 END 7787 {sql_set_info_option} 7788 ) 7789 """ 7790 self.conn.execute(sql_query_update) 7791 7792 else: 7793 7794 log.warning(f"No profiles in parameters") 7795 7796 # Remove added columns 7797 for added_column in added_columns: 7798 self.drop_column(column=added_column) 7799 7800 # Explode INFOS fields into table fields 7801 if self.get_explode_infos(): 7802 self.explode_infos( 7803 prefix=self.get_explode_infos_prefix(), 7804 fields=self.get_explode_infos_fields(), 7805 force=True, 7806 ) 7807 7808 return True 7809 7810 ### 7811 # HGVS 7812 ### 7813 7814 def annotation_hgvs(self, threads: int = None) -> None: 7815 """ 7816 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7817 coordinates and alleles. 7818 7819 :param threads: The `threads` parameter is an optional integer that specifies the number of 7820 threads to use for parallel processing. If no value is provided, it will default to the number 7821 of threads obtained from the `get_threads()` method 7822 :type threads: int 7823 """ 7824 7825 # Function for each partition of the Dask Dataframe 7826 def partition_function(partition): 7827 """ 7828 The function `partition_function` applies the `annotation_hgvs_partition` function to 7829 each row of a DataFrame called `partition`. 7830 7831 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7832 to be processed 7833 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7834 the "partition" dataframe along the axis 1. 
7835 """ 7836 return partition.apply(annotation_hgvs_partition, axis=1) 7837 7838 def annotation_hgvs_partition(row) -> str: 7839 """ 7840 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7841 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7842 7843 :param row: A dictionary-like object that contains the values for the following keys: 7844 :return: a string that contains the HGVS names associated with the given row of data. 7845 """ 7846 7847 chr = row["CHROM"] 7848 pos = row["POS"] 7849 ref = row["REF"] 7850 alt = row["ALT"] 7851 7852 # Find list of associated transcripts 7853 transcripts_list = list( 7854 polars_conn.execute( 7855 f""" 7856 SELECT transcript 7857 FROM refseq_df 7858 WHERE CHROM='{chr}' 7859 AND POS={pos} 7860 """ 7861 )["transcript"] 7862 ) 7863 7864 # Full HGVS annotation in list 7865 hgvs_full_list = [] 7866 7867 for transcript_name in transcripts_list: 7868 7869 # Transcript 7870 transcript = get_transcript( 7871 transcripts=transcripts, transcript_name=transcript_name 7872 ) 7873 # Exon 7874 if use_exon: 7875 exon = transcript.find_exon_number(pos) 7876 else: 7877 exon = None 7878 # Protein 7879 transcript_protein = None 7880 if use_protein or add_protein or full_format: 7881 transcripts_protein = list( 7882 polars_conn.execute( 7883 f""" 7884 SELECT protein 7885 FROM refseqlink_df 7886 WHERE transcript='{transcript_name}' 7887 LIMIT 1 7888 """ 7889 )["protein"] 7890 ) 7891 if len(transcripts_protein): 7892 transcript_protein = transcripts_protein[0] 7893 7894 # HGVS name 7895 hgvs_name = format_hgvs_name( 7896 chr, 7897 pos, 7898 ref, 7899 alt, 7900 genome=genome, 7901 transcript=transcript, 7902 transcript_protein=transcript_protein, 7903 exon=exon, 7904 use_gene=use_gene, 7905 use_protein=use_protein, 7906 full_format=full_format, 7907 use_version=use_version, 7908 codon_type=codon_type, 7909 ) 7910 hgvs_full_list.append(hgvs_name) 7911 if add_protein and not 
use_protein and not full_format: 7912 hgvs_name = format_hgvs_name( 7913 chr, 7914 pos, 7915 ref, 7916 alt, 7917 genome=genome, 7918 transcript=transcript, 7919 transcript_protein=transcript_protein, 7920 exon=exon, 7921 use_gene=use_gene, 7922 use_protein=True, 7923 full_format=False, 7924 use_version=use_version, 7925 codon_type=codon_type, 7926 ) 7927 hgvs_full_list.append(hgvs_name) 7928 7929 # Create liste of HGVS annotations 7930 hgvs_full = ",".join(hgvs_full_list) 7931 7932 return hgvs_full 7933 7934 # Polars connexion 7935 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7936 7937 # Config 7938 config = self.get_config() 7939 7940 # Databases 7941 # Genome 7942 databases_genomes_folders = ( 7943 config.get("folders", {}) 7944 .get("databases", {}) 7945 .get("genomes", DEFAULT_GENOME_FOLDER) 7946 ) 7947 databases_genome = ( 7948 config.get("folders", {}).get("databases", {}).get("genomes", "") 7949 ) 7950 # refseq database folder 7951 databases_refseq_folders = ( 7952 config.get("folders", {}) 7953 .get("databases", {}) 7954 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7955 ) 7956 # refseq 7957 databases_refseq = config.get("databases", {}).get("refSeq", None) 7958 # refSeqLink 7959 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7960 7961 # Param 7962 param = self.get_param() 7963 7964 # Quick HGVS 7965 if "hgvs_options" in param and param.get("hgvs_options", ""): 7966 log.info(f"Quick HGVS Annotation:") 7967 if not param.get("hgvs", None): 7968 param["hgvs"] = {} 7969 for option in param.get("hgvs_options", "").split(","): 7970 option_var_val = option.split("=") 7971 option_var = option_var_val[0] 7972 if len(option_var_val) > 1: 7973 option_val = option_var_val[1] 7974 else: 7975 option_val = "True" 7976 if option_val.upper() in ["TRUE"]: 7977 option_val = True 7978 elif option_val.upper() in ["FALSE"]: 7979 option_val = False 7980 log.info(f" {option_var}={option_val}") 7981 param["hgvs"][option_var] = option_val 7982 
7983 # Check if HGVS annotation enabled 7984 if "hgvs" in param: 7985 log.info(f"HGVS Annotation... ") 7986 for hgvs_option in param.get("hgvs", {}): 7987 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7988 else: 7989 return 7990 7991 # HGVS Param 7992 param_hgvs = param.get("hgvs", {}) 7993 use_exon = param_hgvs.get("use_exon", False) 7994 use_gene = param_hgvs.get("use_gene", False) 7995 use_protein = param_hgvs.get("use_protein", False) 7996 add_protein = param_hgvs.get("add_protein", False) 7997 full_format = param_hgvs.get("full_format", False) 7998 use_version = param_hgvs.get("use_version", False) 7999 codon_type = param_hgvs.get("codon_type", "3") 8000 8001 # refSseq refSeqLink 8002 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8003 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8004 8005 # Assembly 8006 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8007 8008 # Genome 8009 genome_file = None 8010 if find_genome(databases_genome): 8011 genome_file = find_genome(databases_genome) 8012 else: 8013 genome_file = find_genome( 8014 genome_path=databases_genomes_folders, assembly=assembly 8015 ) 8016 log.debug("Genome: " + str(genome_file)) 8017 8018 # refSseq 8019 refseq_file = find_file_prefix( 8020 input_file=databases_refseq, 8021 prefix="ncbiRefSeq", 8022 folder=databases_refseq_folders, 8023 assembly=assembly, 8024 ) 8025 log.debug("refSeq: " + str(refseq_file)) 8026 8027 # refSeqLink 8028 refseqlink_file = find_file_prefix( 8029 input_file=databases_refseqlink, 8030 prefix="ncbiRefSeqLink", 8031 folder=databases_refseq_folders, 8032 assembly=assembly, 8033 ) 8034 log.debug("refSeqLink: " + str(refseqlink_file)) 8035 8036 # Threads 8037 if not threads: 8038 threads = self.get_threads() 8039 log.debug("Threads: " + str(threads)) 8040 8041 # Variables 8042 table_variants = self.get_table_variants(clause="update") 8043 8044 # Get variants SNV and InDel only 8045 
query_variants = f""" 8046 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8047 FROM {table_variants} 8048 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8049 """ 8050 df_variants = self.get_query_to_df(query_variants) 8051 8052 # Added columns 8053 added_columns = [] 8054 8055 # Add hgvs column in variants table 8056 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8057 added_column = self.add_column( 8058 table_variants, hgvs_column_name, "STRING", default_value=None 8059 ) 8060 added_columns.append(added_column) 8061 8062 log.debug(f"refSeq loading...") 8063 # refSeq in duckDB 8064 refseq_table = get_refseq_table( 8065 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8066 ) 8067 # Loading all refSeq in Dataframe 8068 refseq_query = f""" 8069 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8070 FROM {refseq_table} 8071 JOIN df_variants ON ( 8072 {refseq_table}.chrom = df_variants.CHROM 8073 AND {refseq_table}.txStart<=df_variants.POS 8074 AND {refseq_table}.txEnd>=df_variants.POS 8075 ) 8076 """ 8077 refseq_df = self.conn.query(refseq_query).pl() 8078 8079 if refseqlink_file: 8080 log.debug(f"refSeqLink loading...") 8081 # refSeqLink in duckDB 8082 refseqlink_table = get_refseq_table( 8083 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8084 ) 8085 # Loading all refSeqLink in Dataframe 8086 protacc_column = "protAcc_with_ver" 8087 mrnaacc_column = "mrnaAcc_with_ver" 8088 refseqlink_query = f""" 8089 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8090 FROM {refseqlink_table} 8091 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8092 WHERE protAcc_without_ver IS NOT NULL 8093 """ 8094 # Polars Dataframe 8095 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8096 8097 # Read RefSeq transcripts into a python dict/model. 
8098 log.debug(f"Transcripts loading...") 8099 with tempfile.TemporaryDirectory() as tmpdir: 8100 transcripts_query = f""" 8101 COPY ( 8102 SELECT {refseq_table}.* 8103 FROM {refseq_table} 8104 JOIN df_variants ON ( 8105 {refseq_table}.chrom=df_variants.CHROM 8106 AND {refseq_table}.txStart<=df_variants.POS 8107 AND {refseq_table}.txEnd>=df_variants.POS 8108 ) 8109 ) 8110 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8111 """ 8112 self.conn.query(transcripts_query) 8113 with open(f"{tmpdir}/transcript.tsv") as infile: 8114 transcripts = read_transcripts(infile) 8115 8116 # Polars connexion 8117 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8118 8119 log.debug("Genome loading...") 8120 # Read genome sequence using pyfaidx. 8121 genome = Fasta(genome_file) 8122 8123 log.debug("Start annotation HGVS...") 8124 8125 # Create 8126 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8127 ddf = dd.from_pandas(df_variants, npartitions=threads) 8128 8129 # Use dask.dataframe.apply() to apply function on each partition 8130 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8131 8132 # Convert Dask DataFrame to Pandas Dataframe 8133 df = ddf.compute() 8134 8135 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8136 with tempfile.TemporaryDirectory() as tmpdir: 8137 df_parquet = os.path.join(tmpdir, "df.parquet") 8138 df.to_parquet(df_parquet) 8139 8140 # Update hgvs column 8141 update_variant_query = f""" 8142 UPDATE {table_variants} 8143 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8144 FROM read_parquet('{df_parquet}') as df 8145 WHERE variants."#CHROM" = df.CHROM 8146 AND variants.POS = df.POS 8147 AND variants.REF = df.REF 8148 AND variants.ALT = df.ALT 8149 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8150 """ 8151 self.execute_query(update_variant_query) 8152 8153 # Update INFO column 8154 sql_query_update = f""" 8155 UPDATE {table_variants} 8156 SET INFO = 8157 concat( 8158 CASE 8159 WHEN INFO NOT IN ('','.') 8160 THEN concat(INFO, ';') 8161 ELSE '' 8162 END, 8163 'hgvs=', 8164 {hgvs_column_name} 8165 ) 8166 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8167 """ 8168 self.execute_query(sql_query_update) 8169 8170 # Add header 8171 HGVS_INFOS = { 8172 "hgvs": { 8173 "ID": "hgvs", 8174 "Number": ".", 8175 "Type": "String", 8176 "Description": f"HGVS annotatation with HOWARD", 8177 } 8178 } 8179 8180 for field in HGVS_INFOS: 8181 field_ID = HGVS_INFOS[field]["ID"] 8182 field_description = HGVS_INFOS[field]["Description"] 8183 self.get_header().infos[field_ID] = vcf.parser._Info( 8184 field_ID, 8185 HGVS_INFOS[field]["Number"], 8186 HGVS_INFOS[field]["Type"], 8187 field_description, 8188 "unknown", 8189 "unknown", 8190 code_type_map[HGVS_INFOS[field]["Type"]], 8191 ) 8192 8193 # Remove added columns 8194 for added_column in added_columns: 8195 self.drop_column(column=added_column) 8196 8197 ### 8198 # Calculation 8199 ### 8200 8201 def get_operations_help( 8202 self, operations_config_dict: dict = {}, operations_config_file: str = None 8203 ) -> list: 8204 8205 # Init 8206 operations_help = [] 8207 8208 # operations 8209 operations = self.get_config_json( 8210 name="calculations", 8211 
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        # One help line per operation marked "available"
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    # NOTE(review): mutable default arguments ({}) are shared across calls; safe only
    # as long as these dicts are never mutated inside the method — confirm.
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
              "NOMEN": {
                "options": {
                  "hgvs_field": "hgvs"
                },
                "middle" : null
              }
        """

        # Param
        param = self.get_param()

        # Check operations config file (param-level fallback)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation names are matched case-insensitively)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add ("calculations" shortcut param, comma-separated)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                add_value_into_dict(
                    dict_tree=operations_tmp,
                    sections=[
                        calculation_operation.upper(),
                    ],
                    value=operations.get(calculation_operation.upper(), {}),
                )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations: dispatch on operation "type" (python or sql)
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        mathematical operation to be performed. It includes the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed.
It is used for logging and error handling purposes, 8370 defaults to unknown 8371 :type operation_name: str (optional) 8372 """ 8373 8374 # Operation infos 8375 operation_name = operation.get("name", "unknown") 8376 log.debug(f"process SQL {operation_name}") 8377 output_column_name = operation.get("output_column_name", operation_name) 8378 output_column_type = operation.get("output_column_type", "String") 8379 prefix = operation.get("explode_infos_prefix", "") 8380 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8381 output_column_description = operation.get( 8382 "output_column_description", f"{operation_name} operation" 8383 ) 8384 operation_query = operation.get("operation_query", None) 8385 if isinstance(operation_query, list): 8386 operation_query = " ".join(operation_query) 8387 operation_info_fields = operation.get("info_fields", []) 8388 operation_info_fields_check = operation.get("info_fields_check", False) 8389 operation_info = operation.get("operation_info", True) 8390 operation_table = operation.get( 8391 "table", self.get_table_variants(clause="alter") 8392 ) 8393 8394 # table variants 8395 if operation_table: 8396 table_variants = operation_table 8397 else: 8398 table_variants = self.get_table_variants(clause="alter") 8399 8400 if operation_query: 8401 8402 # Info fields check 8403 operation_info_fields_check_result = True 8404 if operation_info_fields_check: 8405 header_infos = self.get_header().infos 8406 for info_field in operation_info_fields: 8407 operation_info_fields_check_result = ( 8408 operation_info_fields_check_result 8409 and info_field in header_infos 8410 ) 8411 8412 # If info fields available 8413 if operation_info_fields_check_result: 8414 8415 # Added_columns 8416 added_columns = [] 8417 8418 # Create VCF header field 8419 vcf_reader = self.get_header() 8420 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8421 output_column_name, 8422 ".", 8423 output_column_type, 8424 
output_column_description, 8425 "howard calculation", 8426 "0", 8427 self.code_type_map.get(output_column_type), 8428 ) 8429 8430 # Explode infos if needed 8431 log.debug(f"calculation_process_sql prefix {prefix}") 8432 added_columns += self.explode_infos( 8433 prefix=prefix, 8434 fields=[output_column_name] + operation_info_fields, 8435 force=False, 8436 table=table_variants, 8437 ) 8438 8439 # Create column 8440 added_column = self.add_column( 8441 table_name=table_variants, 8442 column_name=prefix + output_column_name, 8443 column_type=output_column_type_sql, 8444 default_value="null", 8445 ) 8446 added_columns.append(added_column) 8447 8448 # Operation calculation 8449 try: 8450 8451 # Query to update calculation column 8452 sql_update = f""" 8453 UPDATE {table_variants} 8454 SET "{prefix}{output_column_name}" = ({operation_query}) 8455 """ 8456 self.conn.execute(sql_update) 8457 8458 # Add to INFO 8459 if operation_info: 8460 sql_update_info = f""" 8461 UPDATE {table_variants} 8462 SET "INFO" = 8463 concat( 8464 CASE 8465 WHEN "INFO" IS NOT NULL 8466 THEN concat("INFO", ';') 8467 ELSE '' 8468 END, 8469 '{output_column_name}=', 8470 "{prefix}{output_column_name}" 8471 ) 8472 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8473 """ 8474 self.conn.execute(sql_update_info) 8475 8476 except: 8477 log.error( 8478 f"Operations config: Calculation '{operation_name}' query failed" 8479 ) 8480 raise ValueError( 8481 f"Operations config: Calculation '{operation_name}' query failed" 8482 ) 8483 8484 # Remove added columns 8485 for added_column in added_columns: 8486 log.debug(f"added_column: {added_column}") 8487 self.drop_column(column=added_column) 8488 8489 else: 8490 log.error( 8491 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8492 ) 8493 raise ValueError( 8494 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8495 ) 8496 8497 else: 8498 log.error( 8499 f"Operations config: Calculation '{operation_name}' query NOT defined" 8500 ) 8501 raise ValueError( 8502 f"Operations config: Calculation '{operation_name}' query NOT defined" 8503 ) 8504 8505 def calculation_process_function( 8506 self, operation: dict, operation_name: str = "unknown" 8507 ) -> None: 8508 """ 8509 The `calculation_process_function` takes in an operation dictionary and performs the specified 8510 function with the given parameters. 8511 8512 :param operation: The `operation` parameter is a dictionary that contains information about the 8513 operation to be performed. It has the following keys: 8514 :type operation: dict 8515 :param operation_name: The `operation_name` parameter is a string that represents the name of 8516 the operation being performed. It is used for logging purposes, defaults to unknown 8517 :type operation_name: str (optional) 8518 """ 8519 8520 operation_name = operation["name"] 8521 log.debug(f"process Python {operation_name}") 8522 function_name = operation["function_name"] 8523 function_params = operation["function_params"] 8524 getattr(self, function_name)(*function_params) 8525 8526 def calculation_variant_id(self) -> None: 8527 """ 8528 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8529 updates the INFO field of a variants table with the variant ID. 
8530 """ 8531 8532 # variant_id annotation field 8533 variant_id_tag = self.get_variant_id_column() 8534 added_columns = [variant_id_tag] 8535 8536 # variant_id hgvs tags" 8537 vcf_infos_tags = { 8538 variant_id_tag: "howard variant ID annotation", 8539 } 8540 8541 # Variants table 8542 table_variants = self.get_table_variants() 8543 8544 # Header 8545 vcf_reader = self.get_header() 8546 8547 # Add variant_id to header 8548 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8549 variant_id_tag, 8550 ".", 8551 "String", 8552 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8553 "howard calculation", 8554 "0", 8555 self.code_type_map.get("String"), 8556 ) 8557 8558 # Update 8559 sql_update = f""" 8560 UPDATE {table_variants} 8561 SET "INFO" = 8562 concat( 8563 CASE 8564 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8565 THEN '' 8566 ELSE concat("INFO", ';') 8567 END, 8568 '{variant_id_tag}=', 8569 "{variant_id_tag}" 8570 ) 8571 """ 8572 self.conn.execute(sql_update) 8573 8574 # Remove added columns 8575 for added_column in added_columns: 8576 self.drop_column(column=added_column) 8577 8578 def calculation_extract_snpeff_hgvs( 8579 self, 8580 snpeff_hgvs: str = "snpeff_hgvs", 8581 snpeff_field: str = "ANN", 8582 ) -> None: 8583 """ 8584 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8585 annotation field in a VCF file and adds them as a new column in the variants table. 8586 8587 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8588 function is used to specify the name of the column that will store the HGVS nomenclatures 8589 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8590 snpeff_hgvs 8591 :type snpeff_hgvs: str (optional) 8592 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8593 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8594 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8595 to ANN 8596 :type snpeff_field: str (optional) 8597 """ 8598 8599 # Snpeff hgvs tags 8600 vcf_infos_tags = { 8601 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8602 } 8603 8604 # Prefix 8605 prefix = self.get_explode_infos_prefix() 8606 if prefix: 8607 prefix = "INFO/" 8608 8609 # snpEff fields 8610 speff_ann_infos = prefix + snpeff_field 8611 speff_hgvs_infos = prefix + snpeff_hgvs 8612 8613 # Variants table 8614 table_variants = self.get_table_variants() 8615 8616 # Header 8617 vcf_reader = self.get_header() 8618 8619 # Add columns 8620 added_columns = [] 8621 8622 # Explode HGVS field in column 8623 added_columns += self.explode_infos(fields=[snpeff_field]) 8624 8625 if snpeff_field in vcf_reader.infos: 8626 8627 log.debug(vcf_reader.infos[snpeff_field]) 8628 8629 # Extract ANN header 8630 ann_description = vcf_reader.infos[snpeff_field].desc 8631 pattern = r"'(.+?)'" 8632 match = re.search(pattern, ann_description) 8633 if match: 8634 ann_header_match = match.group(1).split(" | ") 8635 ann_header_desc = {} 8636 for i in range(len(ann_header_match)): 8637 ann_header_info = "".join( 8638 char for char in ann_header_match[i] if char.isalnum() 8639 ) 8640 ann_header_desc[ann_header_info] = ann_header_match[i] 8641 if not ann_header_desc: 8642 raise ValueError("Invalid header description format") 8643 else: 8644 raise ValueError("Invalid header description format") 8645 8646 # Create variant id 8647 variant_id_column = self.get_variant_id_column() 8648 added_columns += [variant_id_column] 8649 8650 # Create dataframe 8651 dataframe_snpeff_hgvs = self.get_query_to_df( 8652 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8653 ) 8654 8655 # Create main NOMEN column 8656 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8657 speff_ann_infos 8658 ].apply( 8659 lambda x: extract_snpeff_hgvs( 
8660 str(x), header=list(ann_header_desc.values()) 8661 ) 8662 ) 8663 8664 # Add snpeff_hgvs to header 8665 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8666 snpeff_hgvs, 8667 ".", 8668 "String", 8669 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8670 "howard calculation", 8671 "0", 8672 self.code_type_map.get("String"), 8673 ) 8674 8675 # Update 8676 sql_update = f""" 8677 UPDATE variants 8678 SET "INFO" = 8679 concat( 8680 CASE 8681 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8682 THEN '' 8683 ELSE concat("INFO", ';') 8684 END, 8685 CASE 8686 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8687 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8688 THEN concat( 8689 '{snpeff_hgvs}=', 8690 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8691 ) 8692 ELSE '' 8693 END 8694 ) 8695 FROM dataframe_snpeff_hgvs 8696 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8697 8698 """ 8699 self.conn.execute(sql_update) 8700 8701 # Delete dataframe 8702 del dataframe_snpeff_hgvs 8703 gc.collect() 8704 8705 else: 8706 8707 log.warning( 8708 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8709 ) 8710 8711 # Remove added columns 8712 for added_column in added_columns: 8713 self.drop_column(column=added_column) 8714 8715 def calculation_snpeff_ann_explode( 8716 self, 8717 uniquify: bool = True, 8718 output_format: str = "fields", 8719 output_prefix: str = "snpeff_", 8720 snpeff_field: str = "ANN", 8721 ) -> None: 8722 """ 8723 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8724 exploding the HGVS field and updating variant information accordingly. 8725 8726 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8727 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8728 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8729 defaults to True 8730 :type uniquify: bool (optional) 8731 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8732 function specifies the format in which the output annotations will be generated. It has a 8733 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8734 format, defaults to fields 8735 :type output_format: str (optional) 8736 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8737 method is used to specify the prefix that will be added to the output annotations generated 8738 during the calculation process. This prefix helps to differentiate the newly added annotations 8739 from existing ones in the output data. By default, the, defaults to ANN_ 8740 :type output_prefix: str (optional) 8741 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8742 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8743 field will be processed to explode the HGVS annotations and update the variant information 8744 accordingly, defaults to ANN 8745 :type snpeff_field: str (optional) 8746 """ 8747 8748 # SnpEff annotation field 8749 snpeff_hgvs = "snpeff_ann_explode" 8750 8751 # Snpeff hgvs tags 8752 vcf_infos_tags = { 8753 snpeff_hgvs: "Explode snpEff annotations", 8754 } 8755 8756 # Prefix 8757 prefix = self.get_explode_infos_prefix() 8758 if prefix: 8759 prefix = "INFO/" 8760 8761 # snpEff fields 8762 speff_ann_infos = prefix + snpeff_field 8763 speff_hgvs_infos = prefix + snpeff_hgvs 8764 8765 # Variants table 8766 table_variants = self.get_table_variants() 8767 8768 # Header 8769 vcf_reader = self.get_header() 8770 8771 # Add columns 8772 added_columns = [] 8773 8774 # Explode HGVS field in column 8775 added_columns += self.explode_infos(fields=[snpeff_field]) 8776 log.debug(f"snpeff_field={snpeff_field}") 8777 log.debug(f"added_columns={added_columns}") 8778 8779 if snpeff_field in vcf_reader.infos: 8780 8781 # Extract ANN header 8782 ann_description = vcf_reader.infos[snpeff_field].desc 8783 pattern = r"'(.+?)'" 8784 match = re.search(pattern, ann_description) 8785 if match: 8786 ann_header_match = match.group(1).split(" | ") 8787 ann_header = [] 8788 ann_header_desc = {} 8789 for i in range(len(ann_header_match)): 8790 ann_header_info = "".join( 8791 char for char in ann_header_match[i] if char.isalnum() 8792 ) 8793 ann_header.append(ann_header_info) 8794 ann_header_desc[ann_header_info] = ann_header_match[i] 8795 if not ann_header_desc: 8796 raise ValueError("Invalid header description format") 8797 else: 8798 raise ValueError("Invalid header description format") 8799 8800 # Create variant id 8801 variant_id_column = self.get_variant_id_column() 8802 added_columns += [variant_id_column] 8803 8804 # Create dataframe 8805 dataframe_snpeff_hgvs = self.get_query_to_df( 8806 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8807 ) 8808 
8809 # Create snpEff columns 8810 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8811 speff_ann_infos 8812 ].apply( 8813 lambda x: explode_snpeff_ann( 8814 str(x), 8815 uniquify=uniquify, 8816 output_format=output_format, 8817 prefix=output_prefix, 8818 header=list(ann_header_desc.values()), 8819 ) 8820 ) 8821 8822 # Header 8823 ann_annotations_prefix = "" 8824 if output_format.upper() in ["JSON"]: 8825 ann_annotations_prefix = f"{output_prefix}=" 8826 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8827 output_prefix, 8828 ".", 8829 "String", 8830 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8831 + " - JSON format", 8832 "howard calculation", 8833 "0", 8834 self.code_type_map.get("String"), 8835 ) 8836 else: 8837 for ann_annotation in ann_header: 8838 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8839 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8840 ann_annotation_id, 8841 ".", 8842 "String", 8843 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8844 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8845 "howard calculation", 8846 "0", 8847 self.code_type_map.get("String"), 8848 ) 8849 8850 # Update 8851 sql_update = f""" 8852 UPDATE variants 8853 SET "INFO" = 8854 concat( 8855 CASE 8856 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8857 THEN '' 8858 ELSE concat("INFO", ';') 8859 END, 8860 CASE 8861 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8862 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8863 THEN concat( 8864 '{ann_annotations_prefix}', 8865 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8866 ) 8867 ELSE '' 8868 END 8869 ) 8870 FROM dataframe_snpeff_hgvs 8871 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8872 8873 """ 8874 self.conn.execute(sql_update) 8875 8876 # Delete dataframe 8877 del dataframe_snpeff_hgvs 8878 gc.collect() 8879 8880 else: 8881 8882 log.warning( 8883 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8884 ) 8885 8886 # Remove added columns 8887 for added_column in added_columns: 8888 self.drop_column(column=added_column) 8889 8890 def calculation_extract_nomen(self) -> None: 8891 """ 8892 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8893 """ 8894 8895 # NOMEN field 8896 field_nomen_dict = "NOMEN_DICT" 8897 8898 # NOMEN structure 8899 nomen_dict = { 8900 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8901 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8902 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8903 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8904 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8905 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8906 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8907 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8908 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8909 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8910 } 8911 8912 # Param 8913 param = self.get_param() 8914 8915 # Prefix 8916 prefix = self.get_explode_infos_prefix() 8917 8918 # Header 8919 vcf_reader = self.get_header() 8920 8921 # Added columns 8922 added_columns = [] 8923 8924 # Get HGVS field 8925 hgvs_field = ( 8926 param.get("calculation", {}) 8927 .get("calculations", {}) 8928 .get("NOMEN", {}) 8929 .get("options", {}) 8930 .get("hgvs_field", "hgvs") 8931 ) 8932 8933 # Get NOMEN pattern 8934 nomen_pattern = ( 8935 param.get("calculation", {}) 8936 .get("calculations", {}) 8937 .get("NOMEN", {}) 8938 .get("options", {}) 8939 .get("pattern", None) 8940 ) 8941 8942 # transcripts list of preference sources 8943 transcripts_sources = {} 8944 8945 # Get transcripts 8946 transcripts_file = ( 8947 param.get("calculation", {}) 8948 .get("calculations", {}) 8949 .get("NOMEN", {}) 8950 .get("options", {}) 8951 .get("transcripts", None) 8952 ) 8953 transcripts_file = full_path(transcripts_file) 8954 if transcripts_file: 8955 if os.path.exists(transcripts_file): 8956 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8957 transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist() 8958 transcripts_sources["file"] = transcripts_from_file 8959 else: 8960 msg_err = f"Transcript file '{transcripts_file}' does NOT exist" 8961 log.error(msg_err) 8962 raise ValueError(msg_err) 8963 8964 # Get transcripts table 8965 transcripts_table = ( 8966 param.get("calculation", {}) 8967 .get("calculations", {}) 8968 .get("NOMEN", {}) 8969 .get("options", {}) 8970 .get("transcripts_table", self.get_table_variants()) 8971 ) 8972 # Get transcripts column 8973 transcripts_column = ( 8974 param.get("calculation", {}) 8975 .get("calculations", {}) 8976 .get("NOMEN", {}) 8977 .get("options", {}) 8978 .get("transcripts_column", None) 8979 
) 8980 8981 if transcripts_table and transcripts_column: 8982 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8983 # Explode if not exists 8984 added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table) 8985 else: 8986 extra_field_transcript = f"NULL" 8987 8988 # Transcripts of preference source order 8989 transcripts_order = ( 8990 param.get("calculation", {}) 8991 .get("calculations", {}) 8992 .get("NOMEN", {}) 8993 .get("options", {}) 8994 .get("transcripts_order", ["column", "file"]) 8995 ) 8996 8997 # Transcripts from file 8998 transcripts = transcripts_sources.get("file", []) 8999 9000 # Explode HGVS field in column 9001 added_columns += self.explode_infos(fields=[hgvs_field]) 9002 9003 # extra infos 9004 extra_infos = self.get_extra_infos() 9005 extra_field = prefix + hgvs_field 9006 9007 if extra_field in extra_infos: 9008 9009 # Create dataframe 9010 dataframe_hgvs = self.get_query_to_df( 9011 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 9012 ) 9013 9014 # Create main NOMEN column 9015 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 9016 lambda x: find_nomen( 9017 hgvs=x.hgvs, 9018 transcript=x.transcript, 9019 transcripts=transcripts, 9020 pattern=nomen_pattern, 9021 transcripts_source_order=transcripts_order, 9022 ), 9023 axis=1, 9024 ) 9025 9026 # Explode NOMEN Structure and create SQL set for update 9027 sql_nomen_fields = [] 9028 for nomen_field in nomen_dict: 9029 9030 # Explode each field into a column 9031 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 9032 lambda x: dict(x).get(nomen_field, "") 9033 ) 9034 9035 # Create VCF header field 9036 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9037 nomen_field, 9038 ".", 9039 "String", 9040 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 9041 "howard calculation", 9042 "0", 9043 self.code_type_map.get("String"), 9044 ) 9045 
sql_nomen_fields.append( 9046 f""" 9047 CASE 9048 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 9049 THEN concat( 9050 ';{nomen_field}=', 9051 dataframe_hgvs."{nomen_field}" 9052 ) 9053 ELSE '' 9054 END 9055 """ 9056 ) 9057 9058 # SQL set for update 9059 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9060 9061 # Update 9062 sql_update = f""" 9063 UPDATE variants 9064 SET "INFO" = 9065 concat( 9066 CASE 9067 WHEN "INFO" IS NULL 9068 THEN '' 9069 ELSE "INFO" 9070 END, 9071 {sql_nomen_fields_set} 9072 ) 9073 FROM dataframe_hgvs 9074 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9075 AND variants."POS" = dataframe_hgvs."POS" 9076 AND variants."REF" = dataframe_hgvs."REF" 9077 AND variants."ALT" = dataframe_hgvs."ALT" 9078 """ 9079 self.conn.execute(sql_update) 9080 9081 # Delete dataframe 9082 del dataframe_hgvs 9083 gc.collect() 9084 9085 # Remove added columns 9086 for added_column in added_columns: 9087 self.drop_column(column=added_column) 9088 9089 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9090 """ 9091 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9092 pipeline/sample for a variant and updates the variant information in a VCF file. 9093 9094 :param tag: The `tag` parameter is a string that represents the annotation field for the 9095 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 9096 VCF header and to update the corresponding field in the variants table, defaults to 9097 findbypipeline 9098 :type tag: str (optional) 9099 """ 9100 9101 # if FORMAT and samples 9102 if ( 9103 "FORMAT" in self.get_header_columns_as_list() 9104 and self.get_header_sample_list() 9105 ): 9106 9107 # findbypipeline annotation field 9108 findbypipeline_tag = tag 9109 9110 # VCF infos tags 9111 vcf_infos_tags = { 9112 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9113 } 9114 9115 # Prefix 9116 prefix = self.get_explode_infos_prefix() 9117 9118 # Field 9119 findbypipeline_infos = prefix + findbypipeline_tag 9120 9121 # Variants table 9122 table_variants = self.get_table_variants() 9123 9124 # Header 9125 vcf_reader = self.get_header() 9126 9127 # Create variant id 9128 variant_id_column = self.get_variant_id_column() 9129 added_columns = [variant_id_column] 9130 9131 # variant_id, FORMAT and samples 9132 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9133 self.get_header_sample_list() 9134 ) 9135 9136 # Create dataframe 9137 dataframe_findbypipeline = self.get_query_to_df( 9138 f""" SELECT {samples_fields} FROM {table_variants} """ 9139 ) 9140 9141 # Create findbypipeline column 9142 dataframe_findbypipeline[findbypipeline_infos] = ( 9143 dataframe_findbypipeline.apply( 9144 lambda row: findbypipeline( 9145 row, samples=self.get_header_sample_list() 9146 ), 9147 axis=1, 9148 ) 9149 ) 9150 9151 # Add snpeff_hgvs to header 9152 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9153 findbypipeline_tag, 9154 ".", 9155 "String", 9156 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9157 "howard calculation", 9158 "0", 9159 self.code_type_map.get("String"), 9160 ) 9161 9162 # Update 9163 sql_update = f""" 9164 UPDATE variants 9165 SET "INFO" = 9166 concat( 9167 CASE 9168 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9169 THEN '' 9170 ELSE 
concat("INFO", ';') 9171 END, 9172 CASE 9173 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9174 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9175 THEN concat( 9176 '{findbypipeline_tag}=', 9177 dataframe_findbypipeline."{findbypipeline_infos}" 9178 ) 9179 ELSE '' 9180 END 9181 ) 9182 FROM dataframe_findbypipeline 9183 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9184 """ 9185 self.conn.execute(sql_update) 9186 9187 # Remove added columns 9188 for added_column in added_columns: 9189 self.drop_column(column=added_column) 9190 9191 # Delete dataframe 9192 del dataframe_findbypipeline 9193 gc.collect() 9194 9195 def calculation_genotype_concordance(self) -> None: 9196 """ 9197 The function `calculation_genotype_concordance` calculates the genotype concordance for 9198 multi-caller VCF files and updates the variant information in the database. 9199 """ 9200 9201 # if FORMAT and samples 9202 if ( 9203 "FORMAT" in self.get_header_columns_as_list() 9204 and self.get_header_sample_list() 9205 ): 9206 9207 # genotypeconcordance annotation field 9208 genotypeconcordance_tag = "genotypeconcordance" 9209 9210 # VCF infos tags 9211 vcf_infos_tags = { 9212 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9213 } 9214 9215 # Prefix 9216 prefix = self.get_explode_infos_prefix() 9217 9218 # Field 9219 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9220 9221 # Variants table 9222 table_variants = self.get_table_variants() 9223 9224 # Header 9225 vcf_reader = self.get_header() 9226 9227 # Create variant id 9228 variant_id_column = self.get_variant_id_column() 9229 added_columns = [variant_id_column] 9230 9231 # variant_id, FORMAT and samples 9232 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9233 self.get_header_sample_list() 9234 ) 9235 9236 # Create dataframe 9237 dataframe_genotypeconcordance = self.get_query_to_df( 9238 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9239 ) 9240 9241 # Create genotypeconcordance column 9242 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9243 dataframe_genotypeconcordance.apply( 9244 lambda row: genotypeconcordance( 9245 row, samples=self.get_header_sample_list() 9246 ), 9247 axis=1, 9248 ) 9249 ) 9250 9251 # Add genotypeconcordance to header 9252 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9253 genotypeconcordance_tag, 9254 ".", 9255 "String", 9256 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9257 "howard calculation", 9258 "0", 9259 self.code_type_map.get("String"), 9260 ) 9261 9262 # Update 9263 sql_update = f""" 9264 UPDATE variants 9265 SET "INFO" = 9266 concat( 9267 CASE 9268 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9269 THEN '' 9270 ELSE concat("INFO", ';') 9271 END, 9272 CASE 9273 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9274 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9275 THEN concat( 9276 '{genotypeconcordance_tag}=', 9277 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9278 ) 9279 ELSE '' 9280 END 9281 ) 9282 FROM dataframe_genotypeconcordance 9283 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9284 """ 9285 self.conn.execute(sql_update) 9286 9287 # Remove added columns 9288 for added_column in added_columns: 9289 self.drop_column(column=added_column) 9290 9291 # Delete dataframe 9292 del dataframe_genotypeconcordance 9293 gc.collect() 9294 9295 def calculation_barcode(self, tag: str = "barcode") -> None: 9296 """ 9297 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9298 updates the INFO field in the file with the calculated barcode values. 9299 9300 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9301 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9302 the default tag name is set to "barcode", defaults to barcode 9303 :type tag: str (optional) 9304 """ 9305 9306 # if FORMAT and samples 9307 if ( 9308 "FORMAT" in self.get_header_columns_as_list() 9309 and self.get_header_sample_list() 9310 ): 9311 9312 # barcode annotation field 9313 if not tag: 9314 tag = "barcode" 9315 9316 # VCF infos tags 9317 vcf_infos_tags = { 9318 tag: "barcode calculation (VaRank)", 9319 } 9320 9321 # Prefix 9322 prefix = self.get_explode_infos_prefix() 9323 9324 # Field 9325 barcode_infos = prefix + tag 9326 9327 # Variants table 9328 table_variants = self.get_table_variants() 9329 9330 # Header 9331 vcf_reader = self.get_header() 9332 9333 # Create variant id 9334 variant_id_column = self.get_variant_id_column() 9335 added_columns = [variant_id_column] 9336 9337 # variant_id, FORMAT and samples 9338 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9339 self.get_header_sample_list() 9340 ) 9341 9342 # Create dataframe 9343 dataframe_barcode = self.get_query_to_df( 9344 f""" SELECT {samples_fields} FROM {table_variants} """ 9345 ) 9346 9347 # Create barcode column 9348 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9349 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9350 ) 9351 9352 # Add barcode to header 9353 vcf_reader.infos[tag] = vcf.parser._Info( 9354 tag, 9355 ".", 9356 "String", 9357 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9358 "howard calculation", 9359 "0", 9360 self.code_type_map.get("String"), 9361 ) 9362 9363 # Update 9364 sql_update = f""" 9365 UPDATE {table_variants} 9366 SET "INFO" = 9367 concat( 9368 CASE 9369 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9370 THEN '' 9371 ELSE concat("INFO", ';') 9372 END, 9373 CASE 9374 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9375 AND dataframe_barcode."{barcode_infos}" NOT NULL 9376 THEN concat( 9377 '{tag}=', 9378 dataframe_barcode."{barcode_infos}" 9379 ) 9380 ELSE '' 9381 
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fallback to "BCF" when tag is empty/None)
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the barcode tag and its companion samples tag
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            # Accepted pedigree forms: path to a YAML/JSON file, a JSON string,
            # a comma-separated sample list, or a dict {member: sample}
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string (JSON, or comma-separated sample names)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample from the VCF header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (one barcode value per variant, computed from the pedigree samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family tag to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            # Add barcode family samples tag to header
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update
            # Build one SET clause per sample column (plus FORMAT); pedigree samples get
            # the computed barcode, FORMAT gets the tag names, other samples get '.'
            # for sample in ped_samples:
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For fully missing genotypes ('./.'), the nested regexp_replace appears to pad
                # a '.' for each existing FORMAT sub-field before appending the new values
                # — TODO confirm against DuckDB regexp_replace semantics
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            # Accepted forms: path to a YAML/JSON file, a JSON string,
            # a comma-separated "father,mother,child" string, or a dict
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = yaml.safe_load(trio_ped)

                # Trio pedigree is a string (JSON, or "father,mother,child")
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect exactly 3 comma-separated sample names
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list (order matters: father, mother, child)
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (one trio value per variant, from father/mother/child genotypes)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback desc "snpEff hgvs annotations" looks like a
            # stale copy-paste; it is never used since vcf_infos_tags contains trio_tag
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append ';trio=<value>' to INFO (handling NULL/empty INFO)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                            AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_trio
            gc.collect()

    def calculation_vaf_normalization(self) -> None:
        """
        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
        :return: The function does not return anything.
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_normalization annotation field
            vaf_normalization_tag = "VAF"

            # VCF infos tags
            vcf_infos_tags = {
                "VAF": "VAF Variant Frequency",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Do not calculate if VAF already exists
            if "VAF" in vcf_reader.formats:
                log.debug("VAF already on genotypes")
                return

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                f""" "{sample}" """ for sample in self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE(review): {samples_fields} already starts with variant_id and FORMAT,
            # so both columns are selected twice by this query — confirm this is intended
            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
            log.debug(f"query={query}")
            dataframe_vaf_normalization = self.get_query_to_df(query=query)

            vaf_normalization_set = []

            # for each sample vaf_normalization: recompute the sample column and
            # register a SET clause for the SQL update
            for sample in self.get_header_sample_list():
                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                    lambda row: vaf_normalization(row, sample=sample), axis=1
                )
                vaf_normalization_set.append(
                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
                )

            # Add VAF to FORMAT
            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
                "FORMAT"
            ].apply(lambda x: str(x) + ":VAF")
            vaf_normalization_set.append(
                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
            )

            # Add vaf_normalization to header
            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
                id=vaf_normalization_tag,
                num="1",
                type="Float",
                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
                type_code=self.code_type_map.get("Float"),
            )

            # Create fields to add in INFO
            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

            # Update
            # NOTE(review): the WHERE clause hardcodes the table name "variants"
            # instead of {table_variants} — works only while get_table_variants()
            # returns "variants"; confirm
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_vaf_normalization_set}
                FROM dataframe_vaf_normalization
                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_normalization
            gc.collect()

    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column
            # Each row holds a dict of per-variant statistics keyed like vcf_infos_tags
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats: one dataframe column per statistic
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator before every field except the first one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append the statistics tags to INFO (handling NULL/empty INFO)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.

        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
        is a string parameter that represents the information field to be used in the transcripts JSON.
        It is used to specify the JSON format for the transcripts information. If no value is provided
        when calling the method, it defaults to "
        :type info_json: str
        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
        method is a string parameter that specifies the format of the information field to be used in
        the transcripts JSON. It is used to define the format of the information field
        :type info_format: str
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Prioritize transcripts when a transcripts view is available
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_export(self) -> None:
        """
        Create the transcripts view and export it to the configured output file
        (see `transcripts_export`). Logs an informational message when no
        transcripts view can be built.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Export transcripts when a transcripts view is available
        if transcripts_table:
            self.transcripts_export(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    ###############
    # Transcripts #
    ###############

    def transcripts_export(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Export the transcripts table to the output file configured in
        param["transcripts"]["export"]["output"].

        For a VCF output, transcript annotation columns are folded into the INFO
        column (and declared in the header); for other formats the annotation
        columns are exported as separate columns.

        :param transcripts_table: name of the transcripts table to export
        :type transcripts_table: str
        :param param: parameters dict; defaults to self.get_param() when empty
        :type param: dict
        :return: False when export parameters are missing; otherwise no explicit
        return value (None) — NOTE(review): the annotation says `-> bool`
        """

        log.debug("Start transcripts export...")

        # Param
        if not param:
            param = self.get_param()

        # Param export
        param_transcript_export = param.get("transcripts", {}).get("export", {})

        # Output file
        transcripts_export_output = param_transcript_export.get("output", None)

        if not param_transcript_export or not transcripts_export_output:
            # NOTE(review): typo "transcriipts" in the warning message
            log.warning(f"No transcriipts export parameters defined!")
            return False

        # List of transcripts annotations (all columns except the variant key columns)
        query_describe = f"""
            SELECT column_name
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
        """
        transcripts_annotations_list = list(
            self.get_query_to_df(query=query_describe)["column_name"]
        )

        # Create transcripts table for export
        # Random suffix avoids clashing with an existing table name
        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )
        query_create_transcripts_table_export = f"""
            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
        """
        self.execute_query(query=query_create_transcripts_table_export)

        # Output file format
        transcripts_export_output_format = get_file_format(
            filename=transcripts_export_output
        )

        # Format VCF - construct INFO
        if transcripts_export_output_format in ["vcf"]:

            # Construct query update INFO and header
            query_update_info = []
            for field in transcripts_annotations_list:

                # If field not in header
                if field not in self.get_header_infos_list():

                    # Declare the annotation field in the VCF header
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Add field as INFO/tag
                query_update_info.append(
                    f"""
                    CASE
                        WHEN "{field}" IS NOT NULL
                        THEN concat('{field}=', "{field}", ';')
                        ELSE ''
                    END
                    """
                )

            # Query param
            query_update_info_value = (
                f""" concat('', {", ".join(query_update_info)}) """
            )
            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """

        else:

            # Query param: non-VCF formats keep annotations as separate columns
            query_update_info_value = f""" NULL """
            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """

        # Update query INFO column
        query_update = f"""
            UPDATE {transcripts_table_export}
            SET INFO = {query_update_info_value}

        """
        self.execute_query(query=query_update)

        # Export
        self.export_output(
            output_file=transcripts_export_output,
            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
        )

        # Drop transcripts export table
        query_drop_transcripts_table_export = f"""
            DROP TABLE {transcripts_table_export}
        """
        self.execute_query(query=query_drop_transcripts_table_export)

    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table (create the view when not provided)
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                # NOTE(review): typo "availalble" in the error message
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        # NOTE(review): pz_param is the same dict object as
        # param["transcripts"]["prioritization"]; mutating it below ("pzfields")
        # is visible to the self.prioritization() call further down
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: mapping of source field -> prefixed INFO tag
        pz_param_pzfields = {}

        # PZ field transcripts
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        # NOTE(review): uses module-level code_type_map here, while other methods
        # use self.code_type_map — confirm both refer to the same mapping
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields produced by the prioritization step
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Declare the prefixed annotation field in the header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param
        # Force the prioritization step to compute only the mandatory PZ fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        # NOTE(review): iterating a set() makes the generated SELECT column order
        # nondeterministic across runs — confirm downstream does not depend on it
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        # One ';tag=value' concat fragment per configured PZ field
        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by (defaults to PZ Flag then PZ Score, both descending)
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must exist in header or in the transcripts table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing:
            # preference order either dominates or is used as a tie-breaker
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part strips the ".<version>" suffix when versions are ignored)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                    ON {transcripts_version_join}
            """

        else:

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table
        # (rn = 1 keeps only the top-ranked transcript per variant)
        # NOTE(review): the WHERE clause hardcodes the "variants" alias instead of
        # {table_variants} — works only while get_table_variants() returns "variants"
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

    # NOTE(review): mutable default arguments ({} and []) below are shared across
    # calls — confirm callers never rely on mutating them
    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
annotation_fields: list = None, 10492 column_rename: dict = {}, 10493 column_clean: bool = False, 10494 column_case: str = None, 10495 ) -> tuple[list, list, list]: 10496 """ 10497 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10498 specified columns mapping for transcripts data. 10499 10500 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10501 of the table where the transcripts data is stored or will be stored in the database. This table 10502 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10503 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10504 :type transcripts_table: str (optional) 10505 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10506 about how to map columns from a transcripts table to create a view. Each entry in the 10507 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10508 typically includes details such as the main transcript column and additional information columns 10509 :type columns_maps: dict 10510 :param added_columns: The `added_columns` parameter in the 10511 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10512 that will be added to the view being created based on the columns map provided. These columns 10513 are generated by exploding the transcript information columns along with the main transcript 10514 column 10515 :type added_columns: list 10516 :param temporary_tables: The `temporary_tables` parameter in the 10517 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10518 tables created during the process of creating a transcript view from a columns map. 
These 10519 temporary tables are used to store intermediate results or transformations before the final view 10520 is generated 10521 :type temporary_tables: list 10522 :param annotation_fields: The `annotation_fields` parameter in the 10523 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10524 used for annotation in the query view creation process. These fields are extracted from the 10525 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10526 :type annotation_fields: list 10527 :param column_rename: The `column_rename` parameter in the 10528 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10529 custom renaming for columns during the creation of the temporary table view. This parameter 10530 provides a mapping of original column names to the desired renamed column names. By using this 10531 parameter, 10532 :type column_rename: dict 10533 :param column_clean: The `column_clean` parameter in the 10534 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10535 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10536 removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to 10537 False 10538 :type column_clean: bool (optional) 10539 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10540 function is used to specify the case transformation to be applied to the columns during the view 10541 creation process. It allows you to control whether the column values should be converted to 10542 lowercase, uppercase, or remain unchanged 10543 :type column_case: str 10544 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10545 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
10546 """ 10547 10548 log.debug("Start transcrpts view creation from columns map...") 10549 10550 # "from_columns_map": [ 10551 # { 10552 # "transcripts_column": "Ensembl_transcriptid", 10553 # "transcripts_infos_columns": [ 10554 # "genename", 10555 # "Ensembl_geneid", 10556 # "LIST_S2_score", 10557 # "LIST_S2_pred", 10558 # ], 10559 # }, 10560 # { 10561 # "transcripts_column": "Ensembl_transcriptid", 10562 # "transcripts_infos_columns": [ 10563 # "genename", 10564 # "VARITY_R_score", 10565 # "Aloft_pred", 10566 # ], 10567 # }, 10568 # ], 10569 10570 # Init 10571 if temporary_tables is None: 10572 temporary_tables = [] 10573 if annotation_fields is None: 10574 annotation_fields = [] 10575 10576 # Variants table 10577 table_variants = self.get_table_variants() 10578 10579 for columns_map in columns_maps: 10580 10581 # Transcript column 10582 transcripts_column = columns_map.get("transcripts_column", None) 10583 10584 # Transcripts infos columns 10585 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10586 10587 # Transcripts infos columns rename 10588 column_rename = columns_map.get("column_rename", column_rename) 10589 10590 # Transcripts infos columns clean 10591 column_clean = columns_map.get("column_clean", column_clean) 10592 10593 # Transcripts infos columns case 10594 column_case = columns_map.get("column_case", column_case) 10595 10596 if transcripts_column is not None: 10597 10598 # Explode 10599 added_columns += self.explode_infos( 10600 fields=[transcripts_column] + transcripts_infos_columns 10601 ) 10602 10603 # View clauses 10604 clause_select_variants = [] 10605 clause_select_tanscripts = [] 10606 for field in [transcripts_column] + transcripts_infos_columns: 10607 10608 # AS field 10609 as_field = field 10610 10611 # Rename 10612 if column_rename: 10613 as_field = column_rename.get(as_field, as_field) 10614 10615 # Clean 10616 if column_clean: 10617 as_field = clean_annotation_field(as_field) 10618 10619 # Case 10620 if 
column_case: 10621 if column_case.lower() in ["lower"]: 10622 as_field = as_field.lower() 10623 elif column_case.lower() in ["upper"]: 10624 as_field = as_field.upper() 10625 10626 # Clause select Variants 10627 clause_select_variants.append( 10628 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10629 ) 10630 10631 if field in [transcripts_column]: 10632 clause_select_tanscripts.append( 10633 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10634 ) 10635 else: 10636 clause_select_tanscripts.append( 10637 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10638 ) 10639 annotation_fields.append(as_field) 10640 10641 # Querey View 10642 query = f""" 10643 SELECT 10644 "#CHROM", POS, REF, ALT, INFO, 10645 "{transcripts_column}" AS 'transcript', 10646 {", ".join(clause_select_tanscripts)} 10647 FROM ( 10648 SELECT 10649 "#CHROM", POS, REF, ALT, INFO, 10650 {", ".join(clause_select_variants)} 10651 FROM {table_variants} 10652 ) 10653 WHERE "{transcripts_column}" IS NOT NULL 10654 """ 10655 10656 # Create temporary table 10657 temporary_table = transcripts_table + "".join( 10658 random.choices(string.ascii_uppercase + string.digits, k=10) 10659 ) 10660 10661 # Temporary_tables 10662 temporary_tables.append(temporary_table) 10663 query_view = f""" 10664 CREATE TEMPORARY TABLE {temporary_table} 10665 AS ({query}) 10666 """ 10667 self.execute_query(query=query_view) 10668 10669 return added_columns, temporary_tables, annotation_fields 10670 10671 def create_transcript_view_from_column_format( 10672 self, 10673 transcripts_table: str = "transcripts", 10674 column_formats: dict = {}, 10675 temporary_tables: list = None, 10676 annotation_fields: list = None, 10677 column_rename: dict = {}, 10678 column_clean: bool = False, 10679 column_case: str = None, 10680 ) -> tuple[list, list, list]: 10681 """ 10682 The `create_transcript_view_from_column_format` function generates a transcript view based on 10683 specified column formats, adds additional 
columns and annotation fields, and returns the list of 10684 temporary tables and annotation fields. 10685 10686 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10687 of the table containing the transcripts data. This table will be used as the base table for 10688 creating the transcript view. The default value for this parameter is "transcripts", but you can 10689 provide a different table name if needed, defaults to transcripts 10690 :type transcripts_table: str (optional) 10691 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10692 about the columns to be used for creating the transcript view. Each entry in the dictionary 10693 specifies the mapping between a transcripts column and a transcripts infos column. This 10694 parameter allows you to define how the columns from the transcripts table should be transformed 10695 or mapped 10696 :type column_formats: dict 10697 :param temporary_tables: The `temporary_tables` parameter in the 10698 `create_transcript_view_from_column_format` function is a list that stores the names of 10699 temporary views created during the process of creating a transcript view from a column format. 10700 These temporary views are used to manipulate and extract data before generating the final 10701 transcript view 10702 :type temporary_tables: list 10703 :param annotation_fields: The `annotation_fields` parameter in the 10704 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10705 that are extracted from the temporary views created during the process. 
These annotation fields 10706 are obtained by querying the temporary views and extracting the column names excluding specific 10707 columns like `#CH 10708 :type annotation_fields: list 10709 :param column_rename: The `column_rename` parameter in the 10710 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10711 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10712 column names to new column names in this dictionary, you can rename specific columns during the 10713 process 10714 :type column_rename: dict 10715 :param column_clean: The `column_clean` parameter in the 10716 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10717 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10718 will be cleaned during the creation of the transcript view based on the specified column format, 10719 defaults to False 10720 :type column_clean: bool (optional) 10721 :param column_case: The `column_case` parameter in the 10722 `create_transcript_view_from_column_format` function is used to specify the case transformation 10723 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10724 to convert the column names to uppercase or lowercase, respectively 10725 :type column_case: str 10726 :return: The `create_transcript_view_from_column_format` function returns two lists: 10727 `temporary_tables` and `annotation_fields`. 
10728 """ 10729 10730 log.debug("Start transcrpts view creation from column format...") 10731 10732 # "from_column_format": [ 10733 # { 10734 # "transcripts_column": "ANN", 10735 # "transcripts_infos_column": "Feature_ID", 10736 # } 10737 # ], 10738 10739 # Init 10740 if temporary_tables is None: 10741 temporary_tables = [] 10742 if annotation_fields is None: 10743 annotation_fields = [] 10744 10745 for column_format in column_formats: 10746 10747 # annotation field and transcript annotation field 10748 annotation_field = column_format.get("transcripts_column", "ANN") 10749 transcript_annotation = column_format.get( 10750 "transcripts_infos_column", "Feature_ID" 10751 ) 10752 10753 # Transcripts infos columns rename 10754 column_rename = column_format.get("column_rename", column_rename) 10755 10756 # Transcripts infos columns clean 10757 column_clean = column_format.get("column_clean", column_clean) 10758 10759 # Transcripts infos columns case 10760 column_case = column_format.get("column_case", column_case) 10761 10762 # Temporary View name 10763 temporary_view_name = transcripts_table + "".join( 10764 random.choices(string.ascii_uppercase + string.digits, k=10) 10765 ) 10766 10767 # Create temporary view name 10768 temporary_view_name = self.annotation_format_to_table( 10769 uniquify=True, 10770 annotation_field=annotation_field, 10771 view_name=temporary_view_name, 10772 annotation_id=transcript_annotation, 10773 column_rename=column_rename, 10774 column_clean=column_clean, 10775 column_case=column_case, 10776 ) 10777 10778 # Annotation fields 10779 if temporary_view_name: 10780 query_annotation_fields = f""" 10781 SELECT * 10782 FROM ( 10783 DESCRIBE SELECT * 10784 FROM {temporary_view_name} 10785 ) 10786 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10787 """ 10788 df_annotation_fields = self.get_query_to_df( 10789 query=query_annotation_fields 10790 ) 10791 10792 # Add temporary view and annotation fields 10793 
temporary_tables.append(temporary_view_name) 10794 annotation_fields += list(set(df_annotation_fields["column_name"])) 10795 10796 return temporary_tables, annotation_fields 10797 10798 def create_transcript_view( 10799 self, 10800 transcripts_table: str = None, 10801 transcripts_table_drop: bool = False, 10802 param: dict = {}, 10803 ) -> str: 10804 """ 10805 The `create_transcript_view` function generates a transcript view by processing data from a 10806 specified table based on provided parameters and structural information. 10807 10808 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10809 is used to specify the name of the table that will store the final transcript view data. If a table 10810 name is not provided, the function will create a new table to store the transcript view data, and by 10811 default,, defaults to transcripts 10812 :type transcripts_table: str (optional) 10813 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10814 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10815 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10816 the function will drop the existing transcripts table if it exists, defaults to False 10817 :type transcripts_table_drop: bool (optional) 10818 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10819 contains information needed to create a transcript view. It includes details such as the structure 10820 of the transcripts, columns mapping, column formats, and other necessary information for generating 10821 the view. This parameter allows for flexibility and customization 10822 :type param: dict 10823 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10824 created or modified during the execution of the function. 
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        Generate the transcripts table/view by exploding and merging transcript
        annotations from the variants table.

        Driven by `param["transcripts"]`: the "struct" section describes how to
        build per-source temporary tables ("from_columns_map" and
        "from_column_format"), which are then merged (UNION BY NAME) and grouped
        per variant + transcript. Optional transcript ID mapping and version
        stripping are applied during the merge.

        :param transcripts_table: Name of the table to create; when None it is
            read from `param["transcripts"]["table"]`, defaults to "transcripts"
        :param transcripts_table_drop: Drop an existing transcripts table before
            creating it, defaults to False
        :param param: Parameters dict; falls back to `self.get_param()` when empty
        :return: The name of the created transcripts table, or None when no
            "struct" section is configured
        """

        log.debug("Start transcripts view creation...")

        # Default table name when none is configured
        transcripts_table_default = "transcripts"

        # Param: fall back to the object's configured parameters
        if not param:
            param = self.get_param()

        # Struct: describes how to build the transcript tables; without it the
        # function is a no-op and returns None
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip the ".N" version suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping: optional file mapping transcript aliases to IDs
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: keep only transcripts present in the mapping
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name from param unless given by the caller
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (dropped again at the end)
            added_columns = []

            # Temporary per-source transcript tables
            temporary_tables = []

            # Annotation fields exposed by the temporary tables
            annotation_fields = []

            # From columns map
            # NOTE(review): the helper mutates the accumulator lists passed in AND
            # returns those same list objects, so the `+=` below self-extends each
            # list (entries appear twice). The duplicates are neutralized later by
            # the `set(...)` deduplications, but drop_column may run twice per
            # added column — confirm this is intentional.
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format (same self-extension caveat as above)
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and remove structural columns from annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge all temporary tables into one query (UNION BY NAME aligns
            # tables with differing column sets)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested subqueries
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating annotations per variant/transcript group
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (distinct transcripts, comma-joined)
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields the same way
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping branch: join against the alias mapping file
            if transcript_id_mapping_file:

                # Load the mapping file as a DataFrame. The local variable name
                # matches the table name used in the SQL below — presumably relied
                # upon by DuckDB's replacement scan to resolve the DataFrame; do
                # not remove even though it looks unused (TODO confirm).
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove: compare IDs without the ".N" suffix
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key for the final merge: mapped ID when available,
                # otherwise the original ID (both without version suffix)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query: aggregate annotations per variant/transcript pair
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge (DESCRIBE on the merge query)
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause (re-aggregate each
                # non-structural column; transcript_original is exposed as
                # 'transcripts_mapped')
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping: collapse to one row per variant + mapped
                # transcript, preferring the mapped ID
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                    {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file: keep only transcripts
                # listed in the mapping (version-insensitive)
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version if requested
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                # NOTE(review): query_transcript_column_select is assigned here but
                # the final query below uses query_transcript_column directly —
                # this local appears to be dead code.
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (transcript_mapped kept as NULL so the
                # output schema matches the mapping branch)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view (no-op if the table already exists
            # and was not dropped above)
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns added to the variants table by explode_infos
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No "struct" configured: nothing to build
            transcripts_table = None

        return transcripts_table
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. snpEff 'ANN') into a
        temporary table with one typed column per annotation sub-field.

        The sub-field names are parsed from the quoted, pipe-separated list in the
        field's header description. Values are converted to JSON per variant, then
        extracted key by key with DuckDB JSON functions; each column's SQL type is
        detected from the observed values.

        :param uniquify: Whether exploded values should be uniquified, defaults to
            True (forwarded to `explode_annotation_format`)
        :param annotation_field: INFO field holding the annotations, defaults to
            "ANN"
        :param annotation_id: Sub-field used as the transcript identifier column,
            defaults to "Feature_ID"
        :param view_name: Name of the temporary table to create, defaults to
            "transcripts"
        :param column_rename: Mapping of original -> renamed column names
        :param column_clean: Whether to clean column names via
            `clean_annotation_field`
        :param column_case: Case transform for column names ("lower"/"upper")
        :return: The created table name, or None when `annotation_field` is not
            present in the VCF header
        :raises ValueError: When the header description does not contain a quoted
            'a | b | c' sub-field list
        """

        # Name of the intermediate JSON column built on the DataFrame
        annotation_format = "annotation_explode"

        # Apply the same rename/clean transforms to the transcript id column so it
        # matches the transformed sub-field column names
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded columns.
        # NOTE(review): any truthy prefix is overwritten with "INFO/" here, and a
        # falsy prefix is kept as-is (could be None, making the concatenations
        # below fail) — confirm this is the intended behavior.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the raw annotation and its JSON-exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped again before returning)
        added_columns = []

        # Explode the annotation field into a real column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted 'a | b | c' list in the
            # header description
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key, mapped back to the original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (also cleaned up at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variants plus the exploded annotation into a DataFrame.
            # The variable name 'dataframe_annotation_format' is referenced by the
            # SQL below — presumably resolved by DuckDB's replacement scan.
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation value to a JSON document keyed by the header
            # labels
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the set of keys present in the JSON documents
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key (original) and its transformed output name
                key = row.iloc[0]
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Sample the key's values to detect its SQL type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings, then map empty/None to
                # NaN and drop those rows so only real values drive type detection
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction expression ('' becomes NULL via NULLIF)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing the transcript id column under
            # the canonical name 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing created
            view_name = None

        # Remove columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
11295 11296 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11297 table containing the transcripts data. If this parameter is not provided, the function will 11298 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11299 :type transcripts_table: str 11300 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11301 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11302 identifier is used to match transcripts with variants in the database 11303 :type transcripts_column_id: str 11304 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11305 of the column in the variants table where the transcripts information will be stored in JSON 11306 format. This parameter allows you to define the column in the variants table that will hold the 11307 JSON-formatted information about transcripts 11308 :type transcripts_info_json: str 11309 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11310 specify the field in the VCF header that will contain information about transcripts in JSON 11311 format. This field will be added to the VCF header as an INFO field with the specified name 11312 :type transcripts_info_field_json: str 11313 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11314 format of the information about transcripts that will be stored in the variants table. This 11315 format can be used to define how the transcript information will be structured or displayed 11316 within the variants table 11317 :type transcripts_info_format: str 11318 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11319 specify the field in the VCF header that will contain information about transcripts in a 11320 specific format. 
This field will be added to the VCF header as an INFO field with the specified 11321 name 11322 :type transcripts_info_field_format: str 11323 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11324 that contains various configuration settings related to transcripts. It is used to provide 11325 default values for certain parameters if they are not explicitly provided when calling the 11326 method. The `param` dictionary can be passed as an argument 11327 :type param: dict 11328 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11329 if the operation is successful and `False` if certain conditions are not met. 11330 """ 11331 11332 msg_info_prefix = "Start transcripts view to variants annotations" 11333 11334 log.debug(f"{msg_info_prefix}...") 11335 11336 # Default 11337 transcripts_table_default = "transcripts" 11338 transcripts_column_id_default = "transcript" 11339 transcripts_info_json_default = None 11340 transcripts_info_format_default = None 11341 transcripts_info_field_json_default = None 11342 transcripts_info_field_format_default = None 11343 11344 # Param 11345 if not param: 11346 param = self.get_param() 11347 11348 # Transcripts table 11349 if transcripts_table is None: 11350 transcripts_table = param.get("transcripts", {}).get( 11351 "table", transcripts_table_default 11352 ) 11353 11354 # Transcripts column ID 11355 if transcripts_column_id is None: 11356 transcripts_column_id = param.get("transcripts", {}).get( 11357 "column_id", transcripts_column_id_default 11358 ) 11359 11360 # Transcripts info json 11361 if transcripts_info_json is None: 11362 transcripts_info_json = param.get("transcripts", {}).get( 11363 "transcripts_info_json", transcripts_info_json_default 11364 ) 11365 11366 # Transcripts info field JSON 11367 if transcripts_info_field_json is None: 11368 transcripts_info_field_json = param.get("transcripts", {}).get( 11369 
"transcripts_info_field_json", transcripts_info_field_json_default 11370 ) 11371 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11372 # transcripts_info_json = transcripts_info_field_json 11373 11374 # Transcripts info format 11375 if transcripts_info_format is None: 11376 transcripts_info_format = param.get("transcripts", {}).get( 11377 "transcripts_info_format", transcripts_info_format_default 11378 ) 11379 11380 # Transcripts info field FORMAT 11381 if transcripts_info_field_format is None: 11382 transcripts_info_field_format = param.get("transcripts", {}).get( 11383 "transcripts_info_field_format", transcripts_info_field_format_default 11384 ) 11385 # if ( 11386 # transcripts_info_field_format is not None 11387 # and transcripts_info_format is None 11388 # ): 11389 # transcripts_info_format = transcripts_info_field_format 11390 11391 # Variants table 11392 table_variants = self.get_table_variants() 11393 11394 # Check info columns param 11395 if ( 11396 transcripts_info_json is None 11397 and transcripts_info_field_json is None 11398 and transcripts_info_format is None 11399 and transcripts_info_field_format is None 11400 ): 11401 return False 11402 11403 # Transcripts infos columns 11404 query_transcripts_infos_columns = f""" 11405 SELECT * 11406 FROM ( 11407 DESCRIBE SELECT * FROM {transcripts_table} 11408 ) 11409 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11410 """ 11411 transcripts_infos_columns = list( 11412 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11413 ) 11414 11415 # View results 11416 clause_select = [] 11417 clause_to_json = [] 11418 clause_to_format = [] 11419 for field in transcripts_infos_columns: 11420 # Do not consider INFO field for export into fields 11421 if field not in ["INFO"]: 11422 clause_select.append( 11423 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11424 ) 11425 clause_to_json.append(f""" 
'{field}': "{field}" """) 11426 clause_to_format.append(f""" "{field}" """) 11427 11428 # Update 11429 update_set_json = [] 11430 update_set_format = [] 11431 11432 # VCF header 11433 vcf_reader = self.get_header() 11434 11435 # Transcripts to info column in JSON 11436 if transcripts_info_json: 11437 11438 # Create column on variants table 11439 self.add_column( 11440 table_name=table_variants, 11441 column_name=transcripts_info_json, 11442 column_type="JSON", 11443 default_value=None, 11444 drop=False, 11445 ) 11446 11447 # Add header 11448 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11449 transcripts_info_json, 11450 ".", 11451 "String", 11452 "Transcripts in JSON format", 11453 "unknwon", 11454 "unknwon", 11455 self.code_type_map["String"], 11456 ) 11457 11458 # Add to update 11459 update_set_json.append( 11460 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11461 ) 11462 11463 # Transcripts to info field in JSON 11464 if transcripts_info_field_json: 11465 11466 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11467 11468 # Add to update 11469 update_set_json.append( 11470 f""" 11471 INFO = concat( 11472 CASE 11473 WHEN INFO NOT IN ('', '.') 11474 THEN INFO 11475 ELSE '' 11476 END, 11477 CASE 11478 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11479 THEN concat( 11480 ';{transcripts_info_field_json}=', 11481 t.{transcripts_info_json} 11482 ) 11483 ELSE '' 11484 END 11485 ) 11486 """ 11487 ) 11488 11489 # Add header 11490 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11491 transcripts_info_field_json, 11492 ".", 11493 "String", 11494 "Transcripts in JSON format", 11495 "unknwon", 11496 "unknwon", 11497 self.code_type_map["String"], 11498 ) 11499 11500 if update_set_json: 11501 11502 # Update query 11503 query_update = f""" 11504 UPDATE {table_variants} 11505 SET {", ".join(update_set_json)} 11506 FROM 11507 ( 11508 SELECT 11509 "#CHROM", POS, REF, ALT, 11510 concat( 11511 '{{', 11512 
string_agg( 11513 '"' || "{transcripts_column_id}" || '":' || 11514 to_json(json_output) 11515 ), 11516 '}}' 11517 )::JSON AS {transcripts_info_json} 11518 FROM 11519 ( 11520 SELECT 11521 "#CHROM", POS, REF, ALT, 11522 "{transcripts_column_id}", 11523 to_json( 11524 {{{",".join(clause_to_json)}}} 11525 )::JSON AS json_output 11526 FROM 11527 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11528 WHERE "{transcripts_column_id}" IS NOT NULL 11529 ) 11530 GROUP BY "#CHROM", POS, REF, ALT 11531 ) AS t 11532 WHERE {table_variants}."#CHROM" = t."#CHROM" 11533 AND {table_variants}."POS" = t."POS" 11534 AND {table_variants}."REF" = t."REF" 11535 AND {table_variants}."ALT" = t."ALT" 11536 """ 11537 11538 self.execute_query(query=query_update) 11539 11540 # Transcripts to info column in FORMAT 11541 if transcripts_info_format: 11542 11543 # Create column on variants table 11544 self.add_column( 11545 table_name=table_variants, 11546 column_name=transcripts_info_format, 11547 column_type="VARCHAR", 11548 default_value=None, 11549 drop=False, 11550 ) 11551 11552 # Add header 11553 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11554 transcripts_info_format, 11555 ".", 11556 "String", 11557 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11558 "unknwon", 11559 "unknwon", 11560 self.code_type_map["String"], 11561 ) 11562 11563 # Add to update 11564 update_set_format.append( 11565 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11566 ) 11567 11568 else: 11569 11570 # Set variable for internal queries 11571 transcripts_info_format = "transcripts_info_format" 11572 11573 # Transcripts to info field in JSON 11574 if transcripts_info_field_format: 11575 11576 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11577 11578 # Add to update 11579 update_set_format.append( 11580 f""" 11581 INFO = concat( 11582 CASE 11583 WHEN INFO NOT IN ('', 
'.') 11584 THEN INFO 11585 ELSE '' 11586 END, 11587 CASE 11588 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11589 THEN concat( 11590 ';{transcripts_info_field_format}=', 11591 t.{transcripts_info_format} 11592 ) 11593 ELSE '' 11594 END 11595 ) 11596 """ 11597 ) 11598 11599 # Add header 11600 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11601 transcripts_info_field_format, 11602 ".", 11603 "String", 11604 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11605 "unknwon", 11606 "unknwon", 11607 self.code_type_map["String"], 11608 ) 11609 11610 if update_set_format: 11611 11612 # Update query 11613 query_update = f""" 11614 UPDATE {table_variants} 11615 SET {", ".join(update_set_format)} 11616 FROM 11617 ( 11618 SELECT 11619 "#CHROM", POS, REF, ALT, 11620 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11621 FROM 11622 ( 11623 SELECT 11624 "#CHROM", POS, REF, ALT, 11625 "{transcripts_column_id}", 11626 concat( 11627 "{transcripts_column_id}", 11628 '|', 11629 {", '|', ".join(clause_to_format)} 11630 ) AS {transcripts_info_format} 11631 FROM 11632 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11633 ) 11634 GROUP BY "#CHROM", POS, REF, ALT 11635 ) AS t 11636 WHERE {table_variants}."#CHROM" = t."#CHROM" 11637 AND {table_variants}."POS" = t."POS" 11638 AND {table_variants}."REF" = t."REF" 11639 AND {table_variants}."ALT" = t."ALT" 11640 """ 11641 11642 self.execute_query(query=query_update) 11643 11644 return True 11645 11646 def rename_info_fields( 11647 self, fields_to_rename: dict = None, table: str = None 11648 ) -> dict: 11649 """ 11650 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11651 corresponding INFO fields in the variants table. 
11652 11653 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11654 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11655 represent the original field names that need to be renamed, and the corresponding values 11656 represent the new names to which the fields should be 11657 :type fields_to_rename: dict 11658 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11659 the table in which the variants data is stored. This table contains information about genetic 11660 variants, and the function updates the corresponding INFO fields in this table when renaming 11661 specified fields in the VCF file header 11662 :type table: str 11663 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11664 the original field names as keys and their corresponding new names (or None if the field was 11665 removed) as values after renaming or removing specified fields in a VCF file header and updating 11666 corresponding INFO fields in the variants table. 
11667 """ 11668 11669 # Init 11670 fields_renamed = {} 11671 config = self.get_config() 11672 access = config.get("access") 11673 11674 if table is None: 11675 table = self.get_table_variants() 11676 11677 if fields_to_rename is not None and access not in ["RO"]: 11678 11679 log.info("Rename or remove fields...") 11680 11681 # Header 11682 header = self.get_header() 11683 11684 for field_to_rename, field_renamed in fields_to_rename.items(): 11685 11686 if field_to_rename in header.infos: 11687 11688 # Rename header 11689 if field_renamed is not None: 11690 header.infos[field_renamed] = vcf.parser._Info( 11691 field_renamed, 11692 header.infos[field_to_rename].num, 11693 header.infos[field_to_rename].type, 11694 header.infos[field_to_rename].desc, 11695 header.infos[field_to_rename].source, 11696 header.infos[field_to_rename].version, 11697 header.infos[field_to_rename].type_code, 11698 ) 11699 del header.infos[field_to_rename] 11700 11701 # Rename INFO patterns 11702 field_pattern = rf'(^|;)({field_to_rename})=([^;]*)' 11703 if field_renamed is not None: 11704 field_renamed_pattern = rf'\1{field_renamed}=\3' 11705 else: 11706 field_renamed_pattern = '' 11707 11708 # Rename INFO 11709 query = f""" 11710 UPDATE {table} 11711 SET 11712 INFO = regexp_replace(INFO, '{field_pattern}', '{field_renamed_pattern}', 'g') 11713 """ 11714 self.execute_query(query=query) 11715 11716 # Return 11717 fields_renamed[field_to_rename] = field_renamed 11718 11719 # Log 11720 if field_renamed is not None: 11721 log.info(f"Rename or remove fields: field '{field_to_rename}' renamed to '{field_renamed}'") 11722 else: 11723 log.info(f"Rename or remove fields: field '{field_to_rename}' removed") 11724 11725 return fields_renamed 11726 11727 def calculation_rename_info_fields( 11728 self, 11729 fields_to_rename: dict = None, 11730 table: str = None, 11731 operation_name: str = "RENAME_INFO_FIELDS", 11732 ) -> None: 11733 """ 11734 The `calculation_rename_info_fields` function retrieves 
parameters from a dictionary, updates 11735 fields to rename and table if provided, and then calls another function to rename the fields. 11736 11737 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11738 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11739 the key and the new field name as the value 11740 :type fields_to_rename: dict 11741 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11742 specify the name of the table for which the fields are to be renamed. It is a string type 11743 parameter 11744 :type table: str 11745 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11746 method is a string that specifies the name of the operation being performed. In this context, it 11747 is used as a default value for the operation name if not explicitly provided when calling the 11748 function, defaults to RENAME_INFO_FIELDS 11749 :type operation_name: str (optional) 11750 """ 11751 11752 # Param 11753 param = self.get_param() 11754 11755 # Get param fields to rename 11756 param_fields_to_rename = ( 11757 param.get("calculation", {}) 11758 .get("calculations", {}) 11759 .get(operation_name, {}) 11760 .get("fields_to_rename", None) 11761 ) 11762 11763 # Get param table 11764 param_table = ( 11765 param.get("calculation", {}) 11766 .get("calculations", {}) 11767 .get(operation_name, {}) 11768 .get("table", None) 11769 ) 11770 11771 # Init fields_to_rename 11772 if fields_to_rename is None: 11773 fields_to_rename = param_fields_to_rename 11774 11775 # Init table 11776 if table is None: 11777 table = param_table 11778 11779 renamed_fields = self.rename_info_fields( 11780 fields_to_rename=fields_to_rename, table=table 11781 ) 11782 11783 log.debug(f"renamed_fields:{renamed_fields}")
38 def __init__( 39 self, 40 conn=None, 41 input: str = None, 42 output: str = None, 43 config: dict = {}, 44 param: dict = {}, 45 load: bool = False, 46 ) -> None: 47 """ 48 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 49 header 50 51 :param conn: the connection to the database 52 :param input: the input file 53 :param output: the output file 54 :param config: a dictionary containing the configuration of the model 55 :param param: a dictionary containing the parameters of the model 56 """ 57 58 # Init variables 59 self.init_variables() 60 61 # Input 62 self.set_input(input) 63 64 # Config 65 self.set_config(config) 66 67 # Param 68 self.set_param(param) 69 70 # Output 71 self.set_output(output) 72 73 # connexion 74 self.set_connexion(conn) 75 76 # Header 77 self.set_header() 78 79 # Samples 80 self.set_samples() 81 82 # Load data 83 if load: 84 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
86 def set_samples(self, samples: list = None) -> list: 87 """ 88 The function `set_samples` sets the samples attribute of an object to a provided list or 89 retrieves it from a parameter dictionary. 90 91 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 92 input and sets the `samples` attribute of the class to the provided list. If no samples are 93 provided, it tries to get the samples from the class's parameters using the `get_param` method 94 :type samples: list 95 :return: The `samples` list is being returned. 96 """ 97 98 if not samples: 99 samples = self.get_param().get("samples", {}).get("list", None) 100 101 self.samples = samples 102 103 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The `set_samples` method takes a list of samples as
input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method
Returns
The `samples` list is being returned.
105 def get_samples(self) -> list: 106 """ 107 This function returns a list of samples. 108 :return: The `get_samples` method is returning the `samples` attribute of the object. 109 """ 110 111 return self.samples
This function returns a list of samples.
Returns
The `get_samples` method is returning the `samples` attribute of the object.
113 def get_samples_check(self) -> bool: 114 """ 115 This function returns the value of the "check" key within the "samples" dictionary retrieved 116 from the parameters. 117 :return: The method `get_samples_check` is returning the value of the key "check" inside the 118 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 119 method. If the key "check" is not found, it will return `False`. 120 """ 121 122 return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` returns the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it will return `True` (the default used in the code).
124 def set_input(self, input: str = None) -> None: 125 """ 126 The function `set_input` takes a file name as input, extracts the name and extension, and sets 127 attributes in the class accordingly. 128 129 :param input: The `set_input` method in the provided code snippet is used to set attributes 130 related to the input file. Here's a breakdown of the parameters and their usage in the method: 131 :type input: str 132 """ 133 134 if input and not isinstance(input, str): 135 try: 136 self.input = input.name 137 except: 138 log.error(f"Input file '{input} in bad format") 139 raise ValueError(f"Input file '{input} in bad format") 140 else: 141 self.input = input 142 143 # Input format 144 if input: 145 input_name, input_extension = os.path.splitext(self.input) 146 self.input_name = input_name 147 self.input_extension = input_extension 148 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method uses this parameter to set attributes related to the input file (its name, extension, and format).
150 def set_config(self, config: dict) -> None: 151 """ 152 The set_config function takes a config object and assigns it as the configuration object for the 153 class. 154 155 :param config: The `config` parameter in the `set_config` function is a dictionary object that 156 contains configuration settings for the class. When you call the `set_config` function with a 157 dictionary object as the argument, it will set that dictionary as the configuration object for 158 the class 159 :type config: dict 160 """ 161 162 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
164 def set_param(self, param: dict) -> None: 165 """ 166 This function sets a parameter object for the class based on the input dictionary. 167 168 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 169 as the `param` attribute of the class instance 170 :type param: dict 171 """ 172 173 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance
175 def init_variables(self) -> None: 176 """ 177 This function initializes the variables that will be used in the rest of the class 178 """ 179 180 self.prefix = "howard" 181 self.table_variants = "variants" 182 self.dataframe = None 183 184 self.comparison_map = { 185 "gt": ">", 186 "gte": ">=", 187 "lt": "<", 188 "lte": "<=", 189 "equals": "=", 190 "contains": "SIMILAR TO", 191 } 192 193 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 194 195 self.code_type_map_to_sql = { 196 "Integer": "INTEGER", 197 "String": "VARCHAR", 198 "Float": "FLOAT", 199 "Flag": "VARCHAR", 200 } 201 202 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
204 def get_indexing(self) -> bool: 205 """ 206 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 207 returns False. 208 :return: The value of the indexing parameter. 209 """ 210 211 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
213 def get_connexion_config(self) -> dict: 214 """ 215 The function `get_connexion_config` returns a dictionary containing the configuration for a 216 connection, including the number of threads and memory limit. 217 :return: a dictionary containing the configuration for the Connexion library. 218 """ 219 220 # config 221 config = self.get_config() 222 223 # Connexion config 224 connexion_config = {} 225 threads = self.get_threads() 226 227 # Threads 228 if threads: 229 connexion_config["threads"] = threads 230 231 # Memory 232 # if config.get("memory", None): 233 # connexion_config["memory_limit"] = config.get("memory") 234 if self.get_memory(): 235 connexion_config["memory_limit"] = self.get_memory() 236 237 # Temporary directory 238 if config.get("tmp", None): 239 connexion_config["temp_directory"] = config.get("tmp") 240 241 # Access 242 if config.get("access", None): 243 access = config.get("access") 244 if access in ["RO"]: 245 access = "READ_ONLY" 246 elif access in ["RW"]: 247 access = "READ_WRITE" 248 connexion_db = self.get_connexion_db() 249 if connexion_db in ":memory:": 250 access = "READ_WRITE" 251 connexion_config["access_mode"] = access 252 253 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
255 def get_duckdb_settings(self) -> dict: 256 """ 257 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 258 string. 259 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 260 """ 261 262 # config 263 config = self.get_config() 264 265 # duckdb settings 266 duckdb_settings_dict = {} 267 if config.get("duckdb_settings", None): 268 duckdb_settings = config.get("duckdb_settings") 269 duckdb_settings = full_path(duckdb_settings) 270 # duckdb setting is a file 271 if os.path.exists(duckdb_settings): 272 with open(duckdb_settings) as json_file: 273 duckdb_settings_dict = yaml.safe_load(json_file) 274 # duckdb settings is a string 275 else: 276 duckdb_settings_dict = json.loads(duckdb_settings) 277 278 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
280 def set_connexion_db(self) -> str: 281 """ 282 The function `set_connexion_db` returns the appropriate database connection string based on the 283 input format and connection type. 284 :return: the value of the variable `connexion_db`. 285 """ 286 287 # Default connexion db 288 default_connexion_db = ":memory:" 289 290 # Find connexion db 291 if self.get_input_format() in ["db", "duckdb"]: 292 connexion_db = self.get_input() 293 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 294 connexion_db = default_connexion_db 295 elif self.get_connexion_type() in ["tmpfile"]: 296 tmp_name = tempfile.mkdtemp( 297 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 298 ) 299 connexion_db = f"{tmp_name}/tmp.db" 300 elif self.get_connexion_type() != "": 301 connexion_db = self.get_connexion_type() 302 else: 303 connexion_db = default_connexion_db 304 305 # Set connexion db 306 self.connexion_db = connexion_db 307 308 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
310 def set_connexion(self, conn) -> None: 311 """ 312 The function `set_connexion` creates a connection to a database, with options for different 313 database formats and settings. 314 315 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 316 database. If a connection is not provided, a new connection to an in-memory database is created. 317 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 318 sqlite 319 """ 320 321 # Connexion db 322 connexion_db = self.set_connexion_db() 323 324 # Connexion config 325 connexion_config = self.get_connexion_config() 326 327 # Connexion format 328 connexion_format = self.get_config().get("connexion_format", "duckdb") 329 # Set connexion format 330 self.connexion_format = connexion_format 331 332 # Connexion 333 if not conn: 334 if connexion_format in ["duckdb"]: 335 conn = duckdb.connect(connexion_db, config=connexion_config) 336 # duckDB settings 337 duckdb_settings = self.get_duckdb_settings() 338 if duckdb_settings: 339 for setting in duckdb_settings: 340 setting_value = duckdb_settings.get(setting) 341 if isinstance(setting_value, str): 342 setting_value = f"'{setting_value}'" 343 conn.execute(f"PRAGMA {setting}={setting_value};") 344 elif connexion_format in ["sqlite"]: 345 conn = sqlite3.connect(connexion_db) 346 347 # Set connexion 348 self.conn = conn 349 350 # Log 351 log.debug(f"connexion_format: {connexion_format}") 352 log.debug(f"connexion_db: {connexion_db}") 353 log.debug(f"connexion config: {connexion_config}") 354 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
356 def set_output(self, output: str = None) -> None: 357 """ 358 The `set_output` function in Python sets the output file based on the input or a specified key 359 in the config file, extracting the output name, extension, and format. 360 361 :param output: The `output` parameter in the `set_output` method is used to specify the name of 362 the output file. If the config file has an 'output' key, the method sets the output to the value 363 of that key. If no output is provided, it sets the output to `None` 364 :type output: str 365 """ 366 367 if output and not isinstance(output, str): 368 self.output = output.name 369 else: 370 self.output = output 371 372 # Output format 373 if self.output: 374 output_name, output_extension = os.path.splitext(self.output) 375 self.output_name = output_name 376 self.output_extension = output_extension 377 self.output_format = self.output_extension.replace(".", "") 378 else: 379 self.output_name = None 380 self.output_extension = None 381 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The `output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`
383 def set_header(self) -> None: 384 """ 385 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 386 """ 387 388 input_file = self.get_input() 389 default_header_list = [ 390 "##fileformat=VCFv4.2", 391 "#CHROM POS ID REF ALT QUAL FILTER INFO", 392 ] 393 394 # Full path 395 input_file = full_path(input_file) 396 397 if input_file: 398 399 input_format = self.get_input_format() 400 input_compressed = self.get_input_compressed() 401 config = self.get_config() 402 header_list = default_header_list 403 if input_format in [ 404 "vcf", 405 "hdr", 406 "tsv", 407 "csv", 408 "psv", 409 "parquet", 410 "db", 411 "duckdb", 412 ]: 413 # header provided in param 414 if config.get("header_file", None): 415 with open(config.get("header_file"), "rt") as f: 416 header_list = self.read_vcf_header(f) 417 # within a vcf file format (header within input file itsself) 418 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 419 # within a compressed vcf file format (.vcf.gz) 420 if input_compressed: 421 with bgzf.open(input_file, "rt") as f: 422 header_list = self.read_vcf_header(f) 423 # within an uncompressed vcf file format (.vcf) 424 else: 425 with open(input_file, "rt") as f: 426 header_list = self.read_vcf_header(f) 427 # header provided in default external file .hdr 428 elif os.path.exists((input_file + ".hdr")): 429 with open(input_file + ".hdr", "rt") as f: 430 header_list = self.read_vcf_header(f) 431 else: 432 try: # Try to get header info fields and file columns 433 434 with tempfile.TemporaryDirectory() as tmpdir: 435 436 # Create database 437 db_for_header = Database(database=input_file) 438 439 # Get header columns for infos fields 440 db_header_from_columns = ( 441 db_for_header.get_header_from_columns() 442 ) 443 444 # Get real columns in the file 445 db_header_columns = db_for_header.get_columns() 446 447 # Write header file 448 header_file_tmp = os.path.join(tmpdir, "header") 449 f = open(header_file_tmp, "w") 450 
vcf.Writer(f, db_header_from_columns) 451 f.close() 452 453 # Replace #CHROM line with rel columns 454 header_list = db_for_header.read_header_file( 455 header_file=header_file_tmp 456 ) 457 header_list[-1] = "\t".join(db_header_columns) 458 459 except: 460 461 log.warning( 462 f"No header for file {input_file}. Set as default VCF header" 463 ) 464 header_list = default_header_list 465 466 else: # try for unknown format ? 467 468 log.error(f"Input file format '{input_format}' not available") 469 raise ValueError(f"Input file format '{input_format}' not available") 470 471 if not header_list: 472 header_list = default_header_list 473 474 # header as list 475 self.header_list = header_list 476 477 # header as VCF object 478 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 479 480 else: 481 482 self.header_list = None 483 self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
485 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 486 """ 487 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 488 DataFrame based on the connection format. 489 490 :param query: The `query` parameter in the `get_query_to_df` function is a string that 491 represents the SQL query you want to execute. This query will be used to fetch data from a 492 database and convert it into a pandas DataFrame 493 :type query: str 494 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 495 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 496 function will only fetch up to that number of rows from the database query result. If no limit 497 is specified, 498 :type limit: int 499 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 500 """ 501 502 # Connexion format 503 connexion_format = self.get_connexion_format() 504 505 # Limit in query 506 if limit: 507 pd.set_option("display.max_rows", limit) 508 if connexion_format in ["duckdb"]: 509 df = ( 510 self.conn.execute(query) 511 .fetch_record_batch(limit) 512 .read_next_batch() 513 .to_pandas() 514 ) 515 elif connexion_format in ["sqlite"]: 516 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 517 518 # Full query 519 else: 520 if connexion_format in ["duckdb"]: 521 df = self.conn.execute(query).df() 522 elif connexion_format in ["sqlite"]: 523 df = pd.read_sql_query(query, self.conn) 524 525 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The `query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: The `limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns
A pandas DataFrame is being returned by the `get_query_to_df` function.
527 def get_overview(self) -> None: 528 """ 529 The function prints the input, output, config, and dataframe of the current object 530 """ 531 table_variants_from = self.get_table_variants(clause="from") 532 sql_columns = self.get_header_columns_as_sql() 533 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 534 df = self.get_query_to_df(sql_query_export) 535 log.info( 536 "Input: " 537 + str(self.get_input()) 538 + " [" 539 + str(str(self.get_input_format())) 540 + "]" 541 ) 542 log.info( 543 "Output: " 544 + str(self.get_output()) 545 + " [" 546 + str(str(self.get_output_format())) 547 + "]" 548 ) 549 log.info("Config: ") 550 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 551 "\n" 552 ): 553 log.info("\t" + str(d)) 554 log.info("Param: ") 555 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 556 "\n" 557 ): 558 log.info("\t" + str(d)) 559 log.info("Sample list: " + str(self.get_header_sample_list())) 560 log.info("Dataframe: ") 561 for d in str(df).split("\n"): 562 log.info("\t" + str(d)) 563 564 # garbage collector 565 del df 566 gc.collect() 567 568 return None
The function prints the input, output, config, and dataframe of the current object
570 def get_stats(self) -> dict: 571 """ 572 The `get_stats` function calculates and returns various statistics of the current object, 573 including information about the input file, variants, samples, header fields, quality, and 574 SNVs/InDels. 575 :return: a dictionary containing various statistics of the current object. The dictionary has 576 the following structure: 577 """ 578 579 # Log 580 log.info(f"Stats Calculation...") 581 582 # table varaints 583 table_variants_from = self.get_table_variants() 584 585 # stats dict 586 stats = {"Infos": {}} 587 588 ### File 589 input_file = self.get_input() 590 stats["Infos"]["Input file"] = input_file 591 592 # Header 593 header_infos = self.get_header().infos 594 header_formats = self.get_header().formats 595 header_infos_list = list(header_infos) 596 header_formats_list = list(header_formats) 597 598 ### Variants 599 600 stats["Variants"] = {} 601 602 # Variants by chr 603 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 604 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 605 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 606 by=["CHROM"], kind="quicksort" 607 ) 608 609 # Total number of variants 610 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 611 612 # Calculate percentage 613 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 614 lambda x: (x / nb_of_variants) 615 ) 616 617 stats["Variants"]["Number of variants by chromosome"] = ( 618 nb_of_variants_by_chrom.to_dict(orient="index") 619 ) 620 621 stats["Infos"]["Number of variants"] = int(nb_of_variants) 622 623 ### Samples 624 625 # Init 626 samples = {} 627 nb_of_samples = 0 628 629 # Check Samples 630 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 631 log.debug(f"Check samples...") 632 for sample in self.get_header_sample_list(): 633 sql_query_samples = f""" 634 SELECT 
'{sample}' as sample, 635 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 636 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 637 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 638 FROM {table_variants_from} 639 WHERE ( 640 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 641 AND 642 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 643 ) 644 GROUP BY genotype 645 """ 646 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 647 sample_genotype_count = sql_query_genotype_df["count"].sum() 648 if len(sql_query_genotype_df): 649 nb_of_samples += 1 650 samples[f"{sample} - {sample_genotype_count} variants"] = ( 651 sql_query_genotype_df.to_dict(orient="index") 652 ) 653 654 stats["Samples"] = samples 655 stats["Infos"]["Number of samples"] = nb_of_samples 656 657 # # 658 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 659 # stats["Infos"]["Number of samples"] = nb_of_samples 660 # elif nb_of_samples: 661 # stats["Infos"]["Number of samples"] = "not a VCF format" 662 663 ### INFO and FORMAT fields 664 header_types_df = {} 665 header_types_list = { 666 "List of INFO fields": header_infos, 667 "List of FORMAT fields": header_formats, 668 } 669 i = 0 670 for header_type in header_types_list: 671 672 header_type_infos = header_types_list.get(header_type) 673 header_infos_dict = {} 674 675 for info in header_type_infos: 676 677 i += 1 678 header_infos_dict[i] = {} 679 680 # ID 681 header_infos_dict[i]["id"] = info 682 683 # num 684 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 685 if header_type_infos[info].num in genotype_map.keys(): 686 header_infos_dict[i]["Number"] = genotype_map.get( 687 header_type_infos[info].num 688 ) 689 else: 690 header_infos_dict[i]["Number"] = header_type_infos[info].num 691 692 # type 693 if header_type_infos[info].type: 694 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 695 else: 696 header_infos_dict[i]["Type"] = "." 697 698 # desc 699 if header_type_infos[info].desc != None: 700 header_infos_dict[i]["Description"] = header_type_infos[info].desc 701 else: 702 header_infos_dict[i]["Description"] = "" 703 704 if len(header_infos_dict): 705 header_types_df[header_type] = pd.DataFrame.from_dict( 706 header_infos_dict, orient="index" 707 ).to_dict(orient="index") 708 709 # Stats 710 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 711 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 712 stats["Header"] = header_types_df 713 714 ### QUAL 715 if "QUAL" in self.get_header_columns(): 716 sql_query_qual = f""" 717 SELECT 718 avg(CAST(QUAL AS INTEGER)) AS Average, 719 min(CAST(QUAL AS INTEGER)) AS Minimum, 720 max(CAST(QUAL AS INTEGER)) AS Maximum, 721 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 722 median(CAST(QUAL AS INTEGER)) AS Median, 723 variance(CAST(QUAL AS INTEGER)) AS Variance 724 FROM {table_variants_from} 725 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 726 """ 727 728 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 729 stats["Quality"] = {"Stats": qual} 730 731 ### SNV and InDel 732 733 sql_query_snv = f""" 734 735 SELECT Type, count FROM ( 736 737 SELECT 738 'Total' AS Type, 739 count(*) AS count 740 FROM {table_variants_from} 741 742 UNION 743 744 SELECT 745 'MNV' AS Type, 746 count(*) AS count 747 FROM {table_variants_from} 748 WHERE len(REF) > 1 AND len(ALT) > 1 749 AND len(REF) = len(ALT) 750 751 UNION 752 753 SELECT 754 'InDel' AS Type, 755 count(*) AS count 756 FROM {table_variants_from} 757 WHERE len(REF) > 1 OR len(ALT) > 1 758 AND len(REF) != len(ALT) 759 760 UNION 761 762 SELECT 763 'SNV' AS Type, 764 count(*) AS count 765 FROM {table_variants_from} 766 WHERE len(REF) = 1 AND len(ALT) = 1 767 768 ) 769 770 ORDER BY count DESC 771 772 """ 773 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 774 775 
sql_query_snv_substitution = f""" 776 SELECT 777 concat(REF, '>', ALT) AS 'Substitution', 778 count(*) AS count 779 FROM {table_variants_from} 780 WHERE len(REF) = 1 AND len(ALT) = 1 781 GROUP BY REF, ALT 782 ORDER BY count(*) DESC 783 """ 784 snv_substitution = ( 785 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 786 ) 787 stats["Variants"]["Counts"] = snv_indel 788 stats["Variants"]["Substitutions"] = snv_substitution 789 790 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
792 def stats_to_file(self, file: str = None) -> str: 793 """ 794 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 795 into a JSON object, and writes the JSON object to the specified file. 796 797 :param file: The `file` parameter is a string that represents the file path where the JSON data 798 will be written 799 :type file: str 800 :return: the name of the file that was written to. 801 """ 802 803 # Get stats 804 stats = self.get_stats() 805 806 # Serializing json 807 json_object = json.dumps(stats, indent=4) 808 809 # Writing to sample.json 810 with open(file, "w") as outfile: 811 outfile.write(json_object) 812 813 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
815 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 816 """ 817 The `print_stats` function generates a markdown file and prints the statistics contained in a 818 JSON file in a formatted manner. 819 820 :param output_file: The `output_file` parameter is a string that specifies the path and filename 821 of the output file where the stats will be printed in Markdown format. If no `output_file` is 822 provided, a temporary directory will be created and the stats will be saved in a file named 823 "stats.md" within that 824 :type output_file: str 825 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 826 file where the statistics will be saved. If no value is provided, a temporary directory will be 827 created and a default file name "stats.json" will be used 828 :type json_file: str 829 :return: The function `print_stats` does not return any value. It has a return type annotation 830 of `None`. 831 """ 832 833 # Full path 834 output_file = full_path(output_file) 835 json_file = full_path(json_file) 836 837 with tempfile.TemporaryDirectory() as tmpdir: 838 839 # Files 840 if not output_file: 841 output_file = os.path.join(tmpdir, "stats.md") 842 if not json_file: 843 json_file = os.path.join(tmpdir, "stats.json") 844 845 # Create folders 846 if not os.path.exists(os.path.dirname(output_file)): 847 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 848 if not os.path.exists(os.path.dirname(json_file)): 849 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 850 851 # Create stats JSON file 852 stats_file = self.stats_to_file(file=json_file) 853 854 # Print stats file 855 with open(stats_file) as f: 856 stats = yaml.safe_load(f) 857 858 # Output 859 output_title = [] 860 output_index = [] 861 output = [] 862 863 # Title 864 output_title.append("# HOWARD Stats") 865 866 # Index 867 output_index.append("## Index") 868 869 # Process sections 870 for section in stats: 
871 infos = stats.get(section) 872 section_link = "#" + section.lower().replace(" ", "-") 873 output.append(f"## {section}") 874 output_index.append(f"- [{section}]({section_link})") 875 876 if len(infos): 877 for info in infos: 878 try: 879 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 880 is_df = True 881 except: 882 try: 883 df = pd.DataFrame.from_dict( 884 json.loads((infos.get(info))), orient="index" 885 ) 886 is_df = True 887 except: 888 is_df = False 889 if is_df: 890 output.append(f"### {info}") 891 info_link = "#" + info.lower().replace(" ", "-") 892 output_index.append(f" - [{info}]({info_link})") 893 output.append(f"{df.to_markdown(index=False)}") 894 else: 895 output.append(f"- {info}: {infos.get(info)}") 896 else: 897 output.append(f"NA") 898 899 # Write stats in markdown file 900 with open(output_file, "w") as fp: 901 for item in output_title: 902 fp.write("%s\n" % item) 903 for item in output_index: 904 fp.write("%s\n" % item) 905 for item in output: 906 fp.write("%s\n" % item) 907 908 # Output stats in markdown 909 print("") 910 print("\n\n".join(output_title)) 911 print("") 912 print("\n\n".join(output)) 913 print("") 914 915 return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function `print_stats` does not return any value. It has a return type annotation of `None`.
917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None`.
Returns
The format of the input variable is being returned.
941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. If no `input_file` is provided when calling the 948 method, it will default to `None` and the method will then call `self.get_input()` to 949 :type input_file: str 950 :return: The function `get_input_compressed` returns the compressed format of the input 951 variable. 952 """ 953 954 if not input_file: 955 input_file = self.get_input() 956 input_compressed = get_file_compressed(input_file) 957 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The `input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()`.
Returns
The function `get_input_compressed` returns the compressed format of the input variable.
959 def get_output(self) -> str: 960 """ 961 It returns the output of the neuron. 962 :return: The output of the neural network. 963 """ 964 965 return self.output
It returns the output of the neuron.
Returns
The output of the neural network.
967 def get_output_format(self, output_file: str = None) -> str: 968 """ 969 The function `get_output_format` returns the format of the input variable or the output file if 970 provided. 971 972 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 973 that represents the file path of the output file. If no `output_file` is provided when calling 974 the method, it will default to the output obtained from the `get_output` method of the class 975 instance. The 976 :type output_file: str 977 :return: The format of the input variable is being returned. 978 """ 979 980 if not output_file: 981 output_file = self.get_output() 982 output_format = get_file_format(output_file) 983 984 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The `output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
986 def get_config(self) -> dict: 987 """ 988 It returns the config 989 :return: The config variable is being returned. 990 """ 991 return self.config
It returns the config
Returns
The config variable is being returned.
993 def get_param(self) -> dict: 994 """ 995 It returns the param 996 :return: The param variable is being returned. 997 """ 998 return self.param
It returns the param
Returns
The param variable is being returned.
1000 def get_connexion_db(self) -> str: 1001 """ 1002 It returns the connexion_db attribute of the object 1003 :return: The connexion_db is being returned. 1004 """ 1005 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1007 def get_prefix(self) -> str: 1008 """ 1009 It returns the prefix of the object. 1010 :return: The prefix is being returned. 1011 """ 1012 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1014 def get_table_variants(self, clause: str = "select") -> str: 1015 """ 1016 This function returns the table_variants attribute of the object 1017 1018 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1019 defaults to select (optional) 1020 :return: The table_variants attribute of the object. 1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The `get_tmp_dir` method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn
It returns the connection object
Returns
The connection object.
1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 1073 :return: The connection is being closed. 1074 """ 1075 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1077 def get_header(self, type: str = "vcf"): 1078 """ 1079 This function returns the header of the VCF file as a list of strings 1080 1081 :param type: the type of header you want to get, defaults to vcf (optional) 1082 :return: The header of the vcf file. 1083 """ 1084 1085 if self.header_vcf: 1086 if type == "vcf": 1087 return self.header_vcf 1088 elif type == "list": 1089 return self.header_list 1090 else: 1091 if type == "vcf": 1092 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1093 return header 1094 elif type == "list": 1095 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1097 def get_header_infos_list(self) -> list: 1098 """ 1099 This function retrieves a list of information fields from the header. 1100 :return: A list of information fields from the header. 1101 """ 1102 1103 # Init 1104 infos_list = [] 1105 1106 for field in self.get_header().infos: 1107 infos_list.append(field) 1108 1109 return infos_list
This function retrieves a list of information fields from the header.
Returns
A list of information fields from the header.
1111 def get_header_length(self, file: str = None) -> int: 1112 """ 1113 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1114 line. 1115 1116 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1117 header file. If this argument is provided, the function will read the header from the specified 1118 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1119 :type file: str 1120 :return: the length of the header list, excluding the #CHROM line. 1121 """ 1122 1123 if file: 1124 return len(self.read_vcf_header_file(file=file)) - 1 1125 elif self.get_header(type="list"): 1126 return len(self.get_header(type="list")) - 1 1127 else: 1128 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1130 def get_header_columns(self) -> str: 1131 """ 1132 This function returns the header list of a VCF 1133 1134 :return: The length of the header list. 1135 """ 1136 if self.get_header(): 1137 return self.get_header(type="list")[-1] 1138 else: 1139 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1141 def get_header_columns_as_list(self) -> list: 1142 """ 1143 This function returns the header list of a VCF 1144 1145 :return: The length of the header list. 1146 """ 1147 if self.get_header(): 1148 return self.get_header_columns().strip().split("\t") 1149 else: 1150 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1152 def get_header_columns_as_sql(self) -> str: 1153 """ 1154 This function retruns header length (without #CHROM line) 1155 1156 :return: The length of the header list. 1157 """ 1158 sql_column_list = [] 1159 for col in self.get_header_columns_as_list(): 1160 sql_column_list.append(f'"{col}"') 1161 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1163 def get_header_sample_list( 1164 self, check: bool = False, samples: list = None, samples_force: bool = False 1165 ) -> list: 1166 """ 1167 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1168 checking and filtering based on input parameters. 1169 1170 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1171 parameter that determines whether to check if the samples in the list are properly defined as 1172 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1173 list is defined as a, defaults to False 1174 :type check: bool (optional) 1175 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1176 allows you to specify a subset of samples from the header. If you provide a list of sample 1177 names, the function will check if each sample is defined in the header. If a sample is not found 1178 in the 1179 :type samples: list 1180 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1181 a boolean parameter that determines whether to force the function to return the sample list 1182 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1183 function will return the sample list without performing, defaults to False 1184 :type samples_force: bool (optional) 1185 :return: The function `get_header_sample_list` returns a list of samples based on the input 1186 parameters and conditions specified in the function. 
1187 """ 1188 1189 # Init 1190 samples_list = [] 1191 1192 if samples is None: 1193 samples_list = self.header_vcf.samples 1194 else: 1195 samples_checked = [] 1196 for sample in samples: 1197 if sample in self.header_vcf.samples: 1198 samples_checked.append(sample) 1199 else: 1200 log.warning(f"Sample '{sample}' not defined in header") 1201 samples_list = samples_checked 1202 1203 # Force sample list without checking if is_genotype_column 1204 if samples_force: 1205 log.warning(f"Samples {samples_list} not checked if genotypes") 1206 return samples_list 1207 1208 if check: 1209 samples_checked = [] 1210 for sample in samples_list: 1211 if self.is_genotype_column(column=sample): 1212 samples_checked.append(sample) 1213 else: 1214 log.warning( 1215 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1216 ) 1217 samples_list = samples_checked 1218 1219 # Return samples list 1220 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The `check` parameter in the `get_header_sample_list` function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If `check` is set to `True`, the function will verify each sample in the list. Defaults to False.
- samples: The `samples` parameter in the `get_header_sample_list` function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header.
- samples_force: The `samples_force` parameter in the `get_header_sample_list` function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If `samples_force` is set to `True`, the function will return the sample list without performing any check. Defaults to False.
Returns
The function `get_header_sample_list` returns a list of samples based on the input parameters and conditions specified in the function.
1222 def is_genotype_column(self, column: str = None) -> bool: 1223 """ 1224 This function checks if a given column is a genotype column in a database. 1225 1226 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1227 represents the column name in a database table. This method checks if the specified column is a 1228 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1229 method of 1230 :type column: str 1231 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1232 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1233 column name and returns the result. If the `column` parameter is None, it returns False. 1234 """ 1235 1236 if column is not None: 1237 return Database(database=self.get_input()).is_genotype_column(column=column) 1238 else: 1239 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The `column` parameter in the `is_genotype_column` method is a string that represents a column name in a database table. The method checks whether the specified column is a genotype column in the database; if a column name is provided, it delegates to the `is_genotype_column` method of the `Database` class.
Returns
The `is_genotype_column` method returns a boolean value. If the `column` parameter is not None, it calls the `is_genotype_column` method of the `Database` class with the specified column name and returns the result; otherwise it returns False.
1241 def get_verbose(self) -> bool: 1242 """ 1243 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1244 exist 1245 1246 :return: The value of the key "verbose" in the config dictionary. 1247 """ 1248 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1250 def get_connexion_format(self) -> str: 1251 """ 1252 It returns the connexion format of the object. 1253 :return: The connexion_format is being returned. 1254 """ 1255 connexion_format = self.connexion_format 1256 if connexion_format not in ["duckdb", "sqlite"]: 1257 log.error(f"Unknown connexion format {connexion_format}") 1258 raise ValueError(f"Unknown connexion format {connexion_format}") 1259 else: 1260 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        'variants' table, using the strategy matching the current connexion
        format (duckdb or sqlite).

        :param file: The file to load into the table (path or file object,
            as accepted by pandas.read_csv).
        :param columns: Comma-separated, quoted column names of the target
            table, used verbatim in the INSERT statement (duckdb path only).
        :type columns: str
        :param header_len: Number of lines to skip at the beginning of the
            file before reading the data, defaults to 0.
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to "\t".
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
            the config entry load.chunk, defaults to 1000000.
        :type chunksize: int (optional)
        """

        # Config: load.chunk overrides the chunksize argument when present
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # A falsy chunksize disables loading entirely (nothing is inserted)
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # NOTE: the SQL references the local DataFrame 'chunk' by
                    # name — DuckDB resolves it via its Python replacement
                    # scan. Do not rename the loop variable.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas handles the INSERT itself for sqlite connections
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The `file` parameter is the file that you want to load into a table. It should be the path to the file on your system.
- columns: The `columns` parameter in the `insert_file_to_table` function is a string containing the names of the columns in the table where the data will be inserted, separated by commas (for example, "id", "name").
- header_len: The `header_len` parameter in the `insert_file_to_table` function specifies the number of lines to skip at the beginning of the file before reading the actual data, allowing any header information to be skipped; defaults to 0.
- sep: The `sep` parameter in the `insert_file_to_table` function specifies the separator character used in the file being read. The default separator is the tab character (`\t`); it can be changed to a different separator if needed.
- chunksize: The `chunksize` parameter specifies the number of rows to read at a time when processing the file in chunks; defaults to 1000000.
1316 def load_data( 1317 self, 1318 input_file: str = None, 1319 drop_variants_table: bool = False, 1320 sample_size: int = 20480, 1321 ) -> None: 1322 """ 1323 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1324 table before loading the data and specify a sample size. 1325 1326 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1327 table 1328 :type input_file: str 1329 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1330 determines whether the variants table should be dropped before loading the data. If set to 1331 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1332 not be dropped, defaults to False 1333 :type drop_variants_table: bool (optional) 1334 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1335 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1336 20480 1337 :type sample_size: int (optional) 1338 """ 1339 1340 log.info("Loading...") 1341 1342 # change input file 1343 if input_file: 1344 self.set_input(input_file) 1345 self.set_header() 1346 1347 # drop variants table 1348 if drop_variants_table: 1349 self.drop_variants_table() 1350 1351 # get table variants 1352 table_variants = self.get_table_variants() 1353 1354 # Access 1355 access = self.get_config().get("access", None) 1356 log.debug(f"access: {access}") 1357 1358 # Input format and compress 1359 input_format = self.get_input_format() 1360 input_compressed = self.get_input_compressed() 1361 log.debug(f"input_format: {input_format}") 1362 log.debug(f"input_compressed: {input_compressed}") 1363 1364 # input_compressed_format 1365 if input_compressed: 1366 input_compressed_format = "gzip" 1367 else: 1368 input_compressed_format = "none" 1369 log.debug(f"input_compressed_format: {input_compressed_format}") 1370 1371 # Connexion 
format 1372 connexion_format = self.get_connexion_format() 1373 1374 # Sample size 1375 if not sample_size: 1376 sample_size = -1 1377 log.debug(f"sample_size: {sample_size}") 1378 1379 # Load data 1380 log.debug(f"Load Data from {input_format}") 1381 1382 # DuckDB connexion 1383 if connexion_format in ["duckdb"]: 1384 1385 # Database already exists 1386 if self.input_format in ["db", "duckdb"]: 1387 1388 if connexion_format in ["duckdb"]: 1389 log.debug(f"Input file format '{self.input_format}' duckDB") 1390 else: 1391 log.error( 1392 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1393 ) 1394 raise ValueError( 1395 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1396 ) 1397 1398 # Load from existing database format 1399 else: 1400 1401 try: 1402 # Create Table or View 1403 database = Database(database=self.input) 1404 sql_from = database.get_sql_from(sample_size=sample_size) 1405 1406 if access in ["RO"]: 1407 sql_load = ( 1408 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1409 ) 1410 else: 1411 sql_load = ( 1412 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1413 ) 1414 self.conn.execute(sql_load) 1415 1416 except: 1417 # Format not available 1418 log.error(f"Input file format '{self.input_format}' not available") 1419 raise ValueError( 1420 f"Input file format '{self.input_format}' not available" 1421 ) 1422 1423 # SQLite connexion 1424 elif connexion_format in ["sqlite"] and input_format in [ 1425 "vcf", 1426 "tsv", 1427 "csv", 1428 "psv", 1429 ]: 1430 1431 # Main structure 1432 structure = { 1433 "#CHROM": "VARCHAR", 1434 "POS": "INTEGER", 1435 "ID": "VARCHAR", 1436 "REF": "VARCHAR", 1437 "ALT": "VARCHAR", 1438 "QUAL": "VARCHAR", 1439 "FILTER": "VARCHAR", 1440 "INFO": "VARCHAR", 1441 } 1442 1443 # Strcuture with samples 1444 structure_complete = structure 1445 if self.get_header_sample_list(): 1446 structure["FORMAT"] = "VARCHAR" 
1447 for sample in self.get_header_sample_list(): 1448 structure_complete[sample] = "VARCHAR" 1449 1450 # Columns list for create and insert 1451 sql_create_table_columns = [] 1452 sql_create_table_columns_list = [] 1453 for column in structure_complete: 1454 column_type = structure_complete[column] 1455 sql_create_table_columns.append( 1456 f'"{column}" {column_type} default NULL' 1457 ) 1458 sql_create_table_columns_list.append(f'"{column}"') 1459 1460 # Create database 1461 log.debug(f"Create Table {table_variants}") 1462 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1463 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1464 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1465 self.conn.execute(sql_create_table) 1466 1467 # chunksize define length of file chunk load file 1468 chunksize = 100000 1469 1470 # delimiter 1471 delimiter = file_format_delimiters.get(input_format, "\t") 1472 1473 # Load the input file 1474 with open(self.input, "rt") as input_file: 1475 1476 # Use the appropriate file handler based on the input format 1477 if input_compressed: 1478 input_file = bgzf.open(self.input, "rt") 1479 if input_format in ["vcf"]: 1480 header_len = self.get_header_length() 1481 else: 1482 header_len = 0 1483 1484 # Insert the file contents into a table 1485 self.insert_file_to_table( 1486 input_file, 1487 columns=sql_create_table_columns_list_sql, 1488 header_len=header_len, 1489 sep=delimiter, 1490 chunksize=chunksize, 1491 ) 1492 1493 else: 1494 log.error( 1495 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1496 ) 1497 raise ValueError( 1498 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1499 ) 1500 1501 # Explode INFOS fields into table fields 1502 if self.get_explode_infos(): 1503 self.explode_infos( 1504 prefix=self.get_explode_infos_prefix(), 1505 fields=self.get_explode_infos_fields(), 
1506 force=True, 1507 ) 1508 1509 # Create index after insertion 1510 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped; if set to `False` (default), it will not be dropped.
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file. A falsy value is replaced by -1 (no limit); defaults to 20480.
1512 def get_explode_infos(self) -> bool: 1513 """ 1514 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1515 to False if it is not set. 1516 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1517 value. If the parameter is not present, it will return False. 1518 """ 1519 1520 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode, resolving the "*" keyword
        and regex patterns against the header.

        :param explode_infos_fields: Fields to explode: a comma-separated
            string or a list of field names/regex patterns. When not provided,
            it is read from param "explode" -> "explode_infos_fields"; when
            that is also empty, "*" (all header fields) is used.
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: When True, fields that are not
            present in the header are excluded from the result, defaults to
            False.
        :type remove_fields_not_in_header: bool (optional)
        :return: Sorted-by-discovery list of field names to explode (never
            contains the ".*" pattern itself).
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (string is split on commas; list used as-is)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search: an exact header match wins;
                # otherwise explicitly-listed fields are removed from pattern
                # results so they are not added twice
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can be a comma-separated string (or a Python list) of field names or patterns to explode.
- remove_fields_not_in_header: The `remove_fields_not_in_header` parameter is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields; defaults to False.
Returns
The function `get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided, the "*" keyword (all header fields) is used. The returned list is built after stripping spaces and splitting comma-separated input.
1622 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1623 """ 1624 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1625 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1626 not provided. 1627 1628 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1629 prefix to be used for exploding or expanding information 1630 :type explode_infos_prefix: str 1631 :return: the value of the variable `explode_infos_prefix`. 1632 """ 1633 1634 if not explode_infos_prefix: 1635 explode_infos_prefix = ( 1636 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1637 ) 1638 1639 return explode_infos_prefix
The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
the value of `self.get_param().get("explode", {}).get("explode_infos_prefix", "")` if `explode_infos_prefix` is
not provided.
Parameters
- explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a prefix to be used for exploded INFO fields.
Returns
the value of the variable
explode_infos_prefix.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a SQLite or DuckDB table if it does not already exist
        (optionally dropping and recreating it).

        :param table_name: Name of the table to alter.
        :param column_name: Name of the column to add (case-insensitive
            existence check).
        :param column_type: SQL data type of the new column, e.g. "INTEGER",
            "VARCHAR".
        :param default_value: Optional DEFAULT value for the new column,
            inserted verbatim into the ALTER statement.
        :param drop: When True and the column already exists, drop it first
            and recreate it, defaults to False.
        :type drop: bool (optional)
        :return: A dict describing the added column (table_name, column_name,
            column_type, default_value), or None when the column already
            existed (and was not newly added) or was recreated after a drop.
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (zero-row probe)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): a dropped-and-recreated column is reported as not
        # "added" (returns None); callers such as explode_infos compensate by
        # also testing their 'force' flag — confirm this is intended.
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", or "REAL".
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column; defaults to False.
Returns
A dictionary describing the added column (table name, column name, type and default value), or None if the column was not newly added.
1713 def drop_column( 1714 self, column: dict = None, table_name: str = None, column_name: str = None 1715 ) -> bool: 1716 """ 1717 The `drop_column` function drops a specified column from a given table in a database and returns 1718 True if the column was successfully dropped, and False if the column does not exist in the 1719 table. 1720 1721 :param column: The `column` parameter is a dictionary that contains information about the column 1722 you want to drop. It has two keys: 1723 :type column: dict 1724 :param table_name: The `table_name` parameter is the name of the table from which you want to 1725 drop a column 1726 :type table_name: str 1727 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1728 from the table 1729 :type column_name: str 1730 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1731 and False if the column does not exist in the table. 1732 """ 1733 1734 # Find column infos 1735 if column: 1736 if isinstance(column, dict): 1737 table_name = column.get("table_name", None) 1738 column_name = column.get("column_name", None) 1739 elif isinstance(column, str): 1740 table_name = self.get_table_variants() 1741 column_name = column 1742 else: 1743 table_name = None 1744 column_name = None 1745 1746 if not table_name and not column_name: 1747 return False 1748 1749 # Removed 1750 removed = False 1751 1752 # Check if the column already exists in the table 1753 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1754 columns = self.get_query_to_df(query).columns.tolist() 1755 if column_name in columns: 1756 log.debug(f"The {column_name} column exists in the {table_name} table") 1757 else: 1758 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1759 return False 1760 1761 # Add column in table # ALTER TABLE integers DROP k 1762 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1763 
self.execute_query(add_column_query) 1764 removed = True 1765 log.debug( 1766 f"The {column_name} column was successfully dropped to the {table_name} table" 1767 ) 1768 1769 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is either a dictionary that contains information about the column you want to drop ("table_name" and "column_name" keys) or a column name string.
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns and populate
        them from the INFO string, returning the list of added columns.

        :param prefix: Prefix for the exploded columns; when None or not a
            string, falls back to `get_explode_infos_prefix()`, then "INFO/".
        :type prefix: str
        :param create_index: Whether to (re)create indexes after exploding,
            defaults to False.
        :type create_index: bool (optional)
        :param fields: INFO fields (names or regex patterns) to explode; when
            empty, resolved through `get_explode_infos_fields`.
        :type fields: list
        :param force: Drop and recreate a column that already exists, and
            re-run its UPDATE, defaults to False.
        :type force: bool (optional)
        :param proccess_all_fields_together: Update all exploded columns in a
            single UPDATE statement instead of one per field, defaults to
            False.
        :type proccess_all_fields_together: bool (optional)
        :param table: Target table; defaults to the variants table.
        :type table: str
        :return: List of dicts describing the columns actually added
            (as returned by `add_column`).
        """

        # drop indexes (they would slow down / conflict with the updates)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Read-only databases cannot be altered: return an empty list
        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: absence is not an error)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/number from header; unknown fields are
                    # treated as single String values
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # 'force' also re-runs the UPDATE for a recreated column
                    # (add_column returns None in that case)
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array (one SET clause per field,
                        # extracting the value from the raw INFO string)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            # sqlite has no regex: emulate the extraction with
                            # nested instr/substr calls
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (keeps each UPDATE's working set small);
                # fall back to a single unfiltered pass on failure
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: string used as a prefix for the exploded INFO fields. If `prefix` is not provided or is `None`, the value of `self.get_explode_infos_prefix()` is used.
- create_index: boolean flag specifying whether to create indexes on the exploded INFO fields. Defaults to False.
- fields: list of INFO fields to explode into individual columns. If not provided, all INFO fields are exploded.
- force: boolean flag determining whether to drop and recreate a column if it already exists in the table. Defaults to False.
- proccess_all_fields_together: boolean flag determining whether to process all INFO fields together (True) or individually (False). Defaults to False.
- table: name of the table where the exploded INFO fields are added as individual columns. If not provided, the variants table is used.
Returns
The `explode_infos` function returns a list of added columns.
1988 def create_indexes(self) -> None: 1989 """ 1990 Create indexes on the table after insertion 1991 """ 1992 1993 # Access 1994 access = self.get_config().get("access", None) 1995 1996 # get table variants 1997 table_variants = self.get_table_variants("FROM") 1998 1999 if self.get_indexing() and access not in ["RO"]: 2000 # Create index 2001 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2002 self.conn.execute(sql_create_table_index) 2003 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2004 self.conn.execute(sql_create_table_index) 2005 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2006 self.conn.execute(sql_create_table_index) 2007 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2008 self.conn.execute(sql_create_table_index) 2009 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2010 self.conn.execute(sql_create_table_index) 2011 for field in self.index_additionnal_fields: 2012 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2013 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
2015 def drop_indexes(self) -> None: 2016 """ 2017 Create indexes on the table after insertion 2018 """ 2019 2020 # Access 2021 access = self.get_config().get("access", None) 2022 2023 # get table variants 2024 table_variants = self.get_table_variants("FROM") 2025 2026 # Get database format 2027 connexion_format = self.get_connexion_format() 2028 2029 if access not in ["RO"]: 2030 if connexion_format in ["duckdb"]: 2031 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2032 elif connexion_format in ["sqlite"]: 2033 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2034 2035 list_indexes = self.conn.execute(sql_list_indexes) 2036 index_names = [row[0] for row in list_indexes.fetchall()] 2037 for index in index_names: 2038 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2039 self.conn.execute(sql_drop_table_index)
Drop all existing indexes from the variants table.
2041 def read_vcf_header(self, f) -> list: 2042 """ 2043 It reads the header of a VCF file and returns a list of the header lines 2044 2045 :param f: the file object 2046 :return: The header lines of the VCF file. 2047 """ 2048 2049 header_list = [] 2050 for line in f: 2051 header_list.append(line) 2052 if line.startswith("#CHROM"): 2053 break 2054 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2056 def read_vcf_header_file(self, file: str = None) -> list: 2057 """ 2058 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2059 uncompressed files. 2060 2061 :param file: The `file` parameter is a string that represents the path to the VCF header file 2062 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2063 default to `None` 2064 :type file: str 2065 :return: The function `read_vcf_header_file` returns a list. 2066 """ 2067 2068 if self.get_input_compressed(input_file=file): 2069 with bgzf.open(file, "rt") as f: 2070 return self.read_vcf_header(f=f) 2071 else: 2072 with open(file, "rt") as f: 2073 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: string path to the VCF header file to read. Optional; defaults to None.
Returns
The function `read_vcf_header_file` returns a list of header lines.
2075 def execute_query(self, query: str): 2076 """ 2077 It takes a query as an argument, executes it, and returns the results 2078 2079 :param query: The query to be executed 2080 :return: The result of the query is being returned. 2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
    fields_to_rename: dict | None = None,
) -> bool:
    """
    Export the variants data to an output file (VCF, CSV, TSV, PSV,
    Parquet...), with optional filtering (query), sorting, indexing and
    Parquet partitioning.

    :param output_file: name of the output file; defaults to the object's
        output file
    :param output_header: name of the file receiving the VCF header;
        defaults to "<output_file>.hdr"
    :param export_header: export the header to a separate file, defaults to
        True (forced off for VCF outputs, where the header is embedded)
    :param query: optional SQL query selecting the data to export
    :param parquet_partitions: columns used to partition the Parquet output;
        a comma-separated string is accepted and split
    :param chunk_size: number of records per batch for Parquet export
    :param threads: number of threads for the export; defaults to
        `self.get_threads()`
    :param sort: sort the output on genomic coordinates, defaults to False
    :param index: create an index on the output file, defaults to False
    :param order_by: column(s) used to order the output (VCF format)
    :param fields_to_rename: mapping of original field names to new names
        applied before export
    :return: True if the output file exists after the export, else None
    """

    # Log
    log.info("Exporting...")

    # Resolve full paths
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config and param
    config = self.get_config()
    param = self.get_param()

    # Temporary files to remove at the end
    tmp_to_remove = []

    # Default output file
    if not output_file:
        output_file = self.get_output()

    # Default threads
    if not threads:
        threads = self.get_threads()

    # Rename fields before export
    if not fields_to_rename:
        fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
    self.rename_info_fields(fields_to_rename=fields_to_rename)

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header
        self.export_header(output_file=output_file)

    # Switch off export header if VCF output (the header is part of the VCF)
    output_file_type = get_file_format(output_file)
    if output_file_type in ["vcf"]:
        export_header = False
        tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partitions (accept a comma-separated string)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Include header in output
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source and connexion format
    database_source = self.get_connexion()
    connexion_format = self.get_connexion_format()

    # Explode INFO fields if needed
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # sqlite cannot be exported directly: dump the variants table to a
    # temporary Parquet file first
    if connexion_format in ["sqlite"]:

        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for i in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Export the full variants table
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
        """
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Create database object on the source
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing columns in the header
    existing_columns_header = database.get_header_columns_from_database(query=query)

    # Sample list (VCF output only)
    if output_file_type in ["vcf"]:
        samples = self.get_samples()
        samples_check = self.get_samples_check()
        samples_force = samples is not None
        sample_list = self.get_header_sample_list(
            check=samples_check,
            samples=samples,
            samples_force=samples_force,
        )
    else:
        sample_list = None

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
        sample_list=sample_list,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # Bug fix: the original returned `(os.path.exists(output_file) or None)
    # and (os.path.exists(output_file) or None)` — the duplicated operand is
    # redundant; a single evaluation is strictly equivalent.
    return os.path.exists(output_file) or None
The export_output function exports data from a VCF file to various formats, including VCF,
CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
partitioning.
Parameters
- output_file: name of the output file where the exported data will be saved.
- output_header: name of the file where the header of the VCF file will be exported. If not provided, the header is exported next to the output file with the ".hdr" extension.
- export_header: whether the header should be exported to a separate file. Defaults to True.
- query: optional SQL query used to filter and select specific data before exporting it.
- parquet_partitions: list of columns used for partitioning the Parquet file during export.
- chunk_size: number of records in a batch when exporting data in Parquet format.
- threads: number of threads used during the export process.
- sort: whether the output file should be sorted on genomic coordinates. Defaults to False.
- index: whether an index should be created on the output file. Defaults to False.
- order_by: column(s) used for sorting the output file (VCF format only).
- fields_to_rename: dictionary mapping original field names to new field names applied during export.
Returns
The `export_output` function returns True if the output file exists, or None if it doesn't.
2304 def get_extra_infos(self, table: str = None) -> list: 2305 """ 2306 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2307 in the header. 2308 2309 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2310 name of the table from which you want to retrieve the extra columns that are not present in the 2311 header. If the `table` parameter is not provided when calling the function, it will default to 2312 using the variants 2313 :type table: str 2314 :return: A list of columns that are in the specified table but not in the header of the table. 2315 """ 2316 2317 header_columns = [] 2318 2319 if not table: 2320 table = self.get_table_variants(clause="from") 2321 header_columns = self.get_header_columns() 2322 2323 # Check all columns in the database 2324 query = f""" SELECT * FROM {table} LIMIT 1 """ 2325 log.debug(f"query {query}") 2326 table_columns = self.get_query_to_df(query).columns.tolist() 2327 extra_columns = [] 2328 2329 # Construct extra infos (not in header) 2330 for column in table_columns: 2331 if column not in header_columns: 2332 extra_columns.append(column) 2333 2334 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: name of the table from which to retrieve the extra columns that are not present in the header. If not provided, the variants table is used.
Returns
A list of columns that are in the specified table but not in the header of the table.
2336 def get_extra_infos_sql(self, table: str = None) -> str: 2337 """ 2338 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2339 by double quotes 2340 2341 :param table: The name of the table to get the extra infos from. If None, the default table is 2342 used 2343 :type table: str 2344 :return: A string of the extra infos 2345 """ 2346 2347 return ", ".join( 2348 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2349 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, optionally clean it, and write it to
    `<output_file><output_file_ext>`.

    :param header_name: name of the header file to be created.
        NOTE(review): this parameter is currently only used to decide whether
        to fall back on `self.get_output()`; the written file name is always
        derived from `output_file` — confirm intended behavior.
    :param output_file: file whose header is written; defaults to the
        object's output file when neither `header_name` nor `output_file`
        is given
    :param output_file_ext: extension appended to `output_file` to build the
        header file name, defaults to ".hdr"
    :param clean_header: rewrite malformed header lines (FORMAT fields of
        type Flag become type String), defaults to True
    :param remove_chrom_line: drop the final #CHROM line before writing,
        defaults to False
    :return: the name of the header file created, or None when the object
        has no header
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Database object, used to read the real columns of the input file
        db_for_header = Database(database=self.get_input())
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write the header to a temporary file.
            # Bug fix: context manager replaces the bare open()/close()
            # pair, so the handle is closed even if vcf.Writer raises.
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace the #CHROM line with the real columns of the file
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove #CHROM line if asked
            if remove_chrom_line:
                header_list.pop()

            # Clean malformed header lines
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    # FORMAT fields cannot be of type Flag: force String
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

            # Write the final header file (context-managed, see above)
            tmp_header_name = output_file + output_file_ext
            with open(tmp_header_name, "w") as f:
                f.writelines(header_list)

            return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: name of the header file to be created. If not specified, the header file name is derived from the output file.
- output_file: name of the output file whose header will be written. If not provided, the object's output file is used.
- output_file_ext: extension of the output header file, appended to the `output_file` name. Defaults to ".hdr".
- clean_header: whether to clean the header by rewriting malformed lines (e.g. FORMAT fields of type Flag). Defaults to True.
- remove_chrom_line: whether to remove the #CHROM line from the header before writing it. Defaults to False.
Returns
The function `export_header` returns the name of the temporary header file that is created.
2446 def export_variant_vcf( 2447 self, 2448 vcf_file, 2449 remove_info: bool = False, 2450 add_samples: bool = True, 2451 list_samples: list = [], 2452 where_clause: str = "", 2453 index: bool = False, 2454 threads: int | None = None, 2455 ) -> bool | None: 2456 """ 2457 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2458 remove INFO field, add samples, and control compression and indexing. 2459 2460 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2461 written to. It is the output file that will contain the filtered VCF data based on the specified 2462 parameters 2463 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2464 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2465 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2466 in, defaults to False 2467 :type remove_info: bool (optional) 2468 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2469 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2470 If set to False, the samples will be removed. The default value is True, defaults to True 2471 :type add_samples: bool (optional) 2472 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2473 in the output VCF file. By default, all samples will be included. If you provide a list of 2474 samples, only those samples will be included in the output file 2475 :type list_samples: list 2476 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2477 determines whether or not to create an index for the output VCF file. If `index` is set to 2478 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2479 :type index: bool (optional) 2480 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2481 number of threads to use for exporting the VCF file. It determines how many parallel threads 2482 will be used during the export process. More threads can potentially speed up the export process 2483 by utilizing multiple cores of the processor. If 2484 :type threads: int | None 2485 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2486 method with various parameters including the output file, query, threads, sort flag, and index 2487 flag. The `export_output` method is responsible for exporting the VCF data based on the 2488 specified parameters and configurations provided in the `export_variant_vcf` function. 2489 """ 2490 2491 # Config 2492 config = self.get_config() 2493 2494 # Extract VCF 2495 log.debug("Export VCF...") 2496 2497 # Table variants 2498 table_variants = self.get_table_variants() 2499 2500 # Threads 2501 if not threads: 2502 threads = self.get_threads() 2503 2504 # Info fields 2505 if remove_info: 2506 if not isinstance(remove_info, str): 2507 remove_info = "." 
2508 info_field = f"""'{remove_info}' as INFO""" 2509 else: 2510 info_field = "INFO" 2511 2512 # Samples fields 2513 if add_samples: 2514 if not list_samples: 2515 list_samples = self.get_header_sample_list() 2516 if list_samples: 2517 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2518 else: 2519 samples_fields = "" 2520 log.debug(f"samples_fields: {samples_fields}") 2521 else: 2522 samples_fields = "" 2523 2524 # Where clause 2525 if where_clause is None: 2526 where_clause = "" 2527 2528 # Variants 2529 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2530 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2531 log.debug(f"sql_query_select={sql_query_select}") 2532 2533 return self.export_output( 2534 output_file=vcf_file, 2535 output_header=None, 2536 export_header=True, 2537 query=sql_query_select, 2538 parquet_partitions=None, 2539 chunk_size=config.get("chunk_size", None), 2540 threads=threads, 2541 sort=True, 2542 index=index, 2543 order_by=None, 2544 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: name of the file where the VCF data will be written.
- remove_info: whether to remove the INFO field from the output VCF file (it is replaced by "."). Defaults to False.
- add_samples: whether sample columns are included in the VCF file. Defaults to True.
- list_samples: list of samples to include in the output VCF file; by default all samples are included.
- index: whether to create a tabix index on the output VCF file. Defaults to False.
- threads: number of threads used for exporting the VCF file.
Returns
The result of calling `export_output` with the built query, which exports the VCF data based on the specified parameters.
2546 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2547 """ 2548 It takes a list of commands and runs them in parallel using the number of threads specified 2549 2550 :param commands: A list of commands to run 2551 :param threads: The number of threads to use, defaults to 1 (optional) 2552 """ 2553 2554 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2556 def get_threads(self, default: int = 1) -> int: 2557 """ 2558 This function returns the number of threads to use for a job, with a default value of 1 if not 2559 specified. 2560 2561 :param default: The `default` parameter in the `get_threads` method is used to specify the 2562 default number of threads to use if no specific value is provided. If no value is provided for 2563 the `threads` parameter in the configuration or input parameters, the `default` value will be 2564 used, defaults to 1 2565 :type default: int (optional) 2566 :return: the number of threads to use for the current job. 2567 """ 2568 2569 # Config 2570 config = self.get_config() 2571 2572 # Param 2573 param = self.get_param() 2574 2575 # Input threads 2576 input_thread = param.get("threads", config.get("threads", None)) 2577 2578 # Check threads 2579 if not input_thread: 2580 threads = default 2581 elif int(input_thread) <= 0: 2582 threads = os.cpu_count() 2583 else: 2584 threads = int(input_thread) 2585 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: the `default` parameter in the `get_threads` method specifies the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value is used. Defaults to 1.
Returns
the number of threads to use for the current job.
2587 def get_memory(self, default: str = None) -> str: 2588 """ 2589 This function retrieves the memory value from parameters or configuration with a default value 2590 if not found. 2591 2592 :param default: The `get_memory` function takes in a default value as a string parameter. This 2593 default value is used as a fallback in case the `memory` parameter is not provided in the 2594 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2595 the function 2596 :type default: str 2597 :return: The `get_memory` function returns a string value representing the memory parameter. If 2598 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2599 return the default value provided as an argument to the function. 2600 """ 2601 2602 # Config 2603 config = self.get_config() 2604 2605 # Param 2606 param = self.get_param() 2607 2608 # Input threads 2609 input_memory = param.get("memory", config.get("memory", None)) 2610 2611 # Check threads 2612 if input_memory: 2613 memory = input_memory 2614 else: 2615 memory = default 2616 2617 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: the `get_memory` function takes a default value as a string parameter. This default value is used as a fallback when the `memory` parameter is not provided in either the `param` dictionary or the `config` dictionary.
Returns
The `get_memory` function returns a string value representing the memory parameter. If `input_memory` is provided in the parameters, that value is returned; otherwise, the default value given as an argument is returned.
2619 def update_from_vcf(self, vcf_file: str) -> None: 2620 """ 2621 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2622 2623 :param vcf_file: the path to the VCF file 2624 """ 2625 2626 connexion_format = self.get_connexion_format() 2627 2628 if connexion_format in ["duckdb"]: 2629 self.update_from_vcf_duckdb(vcf_file) 2630 elif connexion_format in ["sqlite"]: 2631 self.update_from_vcf_sqlite(vcf_file)
If the database connection is duckdb, the duckdb-based update method is used; if it is sqlite, the sqlite-based method is used.
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        a VCF file, using duckdb.

        The VCF is loaded into a pandas DataFrame named ``vcf_df``; duckdb's
        replacement scan resolves that local variable name directly inside
        the SQL query below, so the variable must keep this exact name.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping the '##' meta-information
        # lines so that the '#CHROM' line becomes the column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO, inserting ';' only when
        # both sides are non-empty (empty meaning '' or '.'); rows without a
        # matching non-empty VCF INFO are left as-is
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        a VCF file, using a temporary SQLite table.

        The VCF records are loaded into a temporary table, the INFO values
        are merged into the variants table (joined with ';' only when both
        sides are non-empty), and the temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF, with the same columns as the
        # variants table but no rows (WHERE 0 filters everything out)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table, skipping '#'-prefixed header
        # lines and naming the 8 standard VCF columns explicitly
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                       WHEN INFO NOT IN ('', '.')
                       THEN INFO
                       ELSE ''
                   END ||
                   (
                   SELECT
                       CASE
                           WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                           THEN ';'
                           ELSE ''
                       END ||
                       CASE
                           WHEN table_vcf.INFO NOT IN ('','.')
                           THEN table_vcf.INFO
                           ELSE ''
                       END
                   FROM {table_vcf} as table_vcf
                   WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                       AND table_vcf.\"POS\" = table_variants.\"POS\"
                       AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                       AND table_vcf.\"REF\" = table_variants.\"REF\"
                   )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2747 def drop_variants_table(self) -> None: 2748 """ 2749 > This function drops the variants table 2750 """ 2751 2752 table_variants = self.get_table_variants() 2753 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2754 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash of the
        `#CHROM`, `POS`, `REF`, and `ALT` columns

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (parameters take precedence over configuration)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a dedicated column; the columns actually
        # added here are dropped again at the end of this method
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (fall back to the default name when empty)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument '"{prefix}SVTYPE"' is a
            # single-quoted SQL *string literal*, not the double-quoted
            # identifier of the exploded SVTYPE column, so every row hashes
            # the same constant text and SVTYPE does not contribute
            # per-variant. Confirm whether this is intended before changing
            # it -- fixing it would alter all existing variant_id values.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2815 def get_variant_id_column( 2816 self, variant_id_column: str = "variant_id", force: bool = None 2817 ) -> str: 2818 """ 2819 This function returns the variant_id column name 2820 2821 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2822 defaults to variant_id 2823 :type variant_id_column: str (optional) 2824 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2825 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2826 if it is not already set, or if it is set 2827 :type force: bool 2828 :return: The variant_id column name. 2829 """ 2830 2831 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2837 def scan_databases( 2838 self, 2839 database_formats: list = ["parquet"], 2840 database_releases: list = ["current"], 2841 ) -> dict: 2842 """ 2843 The function `scan_databases` scans for available databases based on specified formats and 2844 releases. 2845 2846 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2847 of the databases to be scanned. In this case, the accepted format is "parquet" 2848 :type database_formats: list ["parquet"] 2849 :param database_releases: The `database_releases` parameter is a list that specifies the 2850 releases of the databases to be scanned. In the provided function, the default value for 2851 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2852 databases that are in the "current" 2853 :type database_releases: list 2854 :return: The function `scan_databases` returns a dictionary containing information about 2855 databases that match the specified formats and releases. 2856 """ 2857 2858 # Config 2859 config = self.get_config() 2860 2861 # Param 2862 param = self.get_param() 2863 2864 # Param - Assembly 2865 assembly = param.get("assembly", config.get("assembly", None)) 2866 if not assembly: 2867 assembly = DEFAULT_ASSEMBLY 2868 log.warning(f"Default assembly '{assembly}'") 2869 2870 # Scan for availabled databases 2871 log.info( 2872 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2873 ) 2874 databases_infos_dict = databases_infos( 2875 database_folder_releases=database_releases, 2876 database_formats=database_formats, 2877 assembly=assembly, 2878 config=config, 2879 ) 2880 log.info( 2881 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2882 ) 2883 2884 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: a list that specifies the formats of the databases to be scanned; in this case, the accepted format is "parquet".
- database_releases: a list that specifies the releases of the databases to be scanned. The default value for `database_releases` is `["current"]`, meaning that by default the function scans databases in the "current" release.
Returns
The `scan_databases` function returns a dictionary containing information about the databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        The quick-annotation strings from the parameters (and the per-tool
        shortcut parameters such as 'annotation_parquet', 'annotation_snpeff',
        ...) are normalized into the 'annotation' parameter structure, each
        referenced database file is located and its annotation tool is
        detected, then the tool-specific annotation methods are invoked.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (union of the annotations, parquet
        # and bcftools database folders from the configuration)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated quick-annotation string)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold the per-tool shortcut parameters into the
        # quick-annotation list, prefixed with their tool name
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: a string becomes a dict keyed
            # by file with {"INFO": None} (i.e. all INFO fields)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: scan the database folders and
                # add every matching database
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    # NOTE(review): these defaults are plain strings while
                    # scan_databases documents list parameters -- confirm
                    # scan_databases tolerates a bare string here
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: 'snpeff:<options>' maps to
                    # param["annotation"]["snpeff"]["options"]
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: 'annovar:<db>:<db>...' maps each db
                    # to param["annotation"]["annovar"]["annotations"]
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: parsed as a key=value option string
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: parsed as a key=value option string
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based tools)
                    else:

                        # Tools detection: an explicit 'tool:' prefix forces
                        # the tool; otherwise it is inferred from the file
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' are both separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: as given, as a full path, then
                                # within the configured database folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools is never preferred here, so the
                                    # first branch below is effectively dead
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" appears twice in
                                        # this list (harmless but redundant)
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        # Persist the normalized parameters before running the tools
        self.set_param(param)

        # Run each configured annotation tool in turn
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
3264 def annotation_bigwig(self, threads: int = None) -> None: 3265 """ 3266 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3267 3268 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3269 number of threads to be used for parallel processing during the annotation process. If the 3270 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3271 threads to use based on the system configuration 3272 :type threads: int 3273 :return: True 3274 """ 3275 3276 # DEBUG 3277 log.debug("Start annotation with bigwig databases") 3278 3279 # # Threads 3280 # if not threads: 3281 # threads = self.get_threads() 3282 # log.debug("Threads: " + str(threads)) 3283 3284 # Config 3285 config = self.get_config() 3286 log.debug("Config: " + str(config)) 3287 3288 # Config - BCFTools databases folders 3289 databases_folders = set( 3290 self.get_config() 3291 .get("folders", {}) 3292 .get("databases", {}) 3293 .get("annotations", ["."]) 3294 + self.get_config() 3295 .get("folders", {}) 3296 .get("databases", {}) 3297 .get("bigwig", ["."]) 3298 ) 3299 log.debug("Databases annotations: " + str(databases_folders)) 3300 3301 # Param 3302 annotations = ( 3303 self.get_param() 3304 .get("annotation", {}) 3305 .get("bigwig", {}) 3306 .get("annotations", None) 3307 ) 3308 log.debug("Annotations: " + str(annotations)) 3309 3310 # Assembly 3311 assembly = self.get_param().get( 3312 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3313 ) 3314 3315 # Data 3316 table_variants = self.get_table_variants() 3317 3318 # Check if not empty 3319 log.debug("Check if not empty") 3320 sql_query_chromosomes = ( 3321 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3322 ) 3323 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3324 if not sql_query_chromosomes_df["count"][0]: 3325 log.info(f"VCF empty") 3326 return 3327 3328 # VCF 
header 3329 vcf_reader = self.get_header() 3330 log.debug("Initial header: " + str(vcf_reader.infos)) 3331 3332 # Existing annotations 3333 for vcf_annotation in self.get_header().infos: 3334 3335 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3336 log.debug( 3337 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3338 ) 3339 3340 if annotations: 3341 3342 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3343 3344 # Export VCF file 3345 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3346 3347 # annotation_bigwig_config 3348 annotation_bigwig_config_list = [] 3349 3350 for annotation in annotations: 3351 annotation_fields = annotations[annotation] 3352 3353 # Annotation Name 3354 annotation_name = os.path.basename(annotation) 3355 3356 if not annotation_fields: 3357 annotation_fields = {"INFO": None} 3358 3359 log.debug(f"Annotation '{annotation_name}'") 3360 log.debug( 3361 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3362 ) 3363 3364 # Create Database 3365 database = Database( 3366 database=annotation, 3367 databases_folders=databases_folders, 3368 assembly=assembly, 3369 ) 3370 3371 # Find files 3372 db_file = database.get_database() 3373 db_file = full_path(db_file) 3374 db_hdr_file = database.get_header_file() 3375 db_hdr_file = full_path(db_hdr_file) 3376 db_file_type = database.get_format() 3377 3378 # If db_file is http ? 
3379 if database.get_database().startswith("http"): 3380 3381 # Datbase is HTTP URL 3382 db_file_is_http = True 3383 3384 # DB file keep as URL 3385 db_file = database.get_database() 3386 log.warning( 3387 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3388 ) 3389 3390 # Retrieve automatic annotation field name 3391 annotation_field = clean_annotation_field( 3392 os.path.basename(db_file).replace(".bw", "") 3393 ) 3394 log.debug( 3395 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3396 ) 3397 3398 # Create automatic header file 3399 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3400 with open(db_hdr_file, "w") as f: 3401 f.write("##fileformat=VCFv4.2\n") 3402 f.write( 3403 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3404 ) 3405 f.write(f"#CHROM START END {annotation_field}\n") 3406 3407 else: 3408 3409 # Datbase is NOT HTTP URL 3410 db_file_is_http = False 3411 3412 # Check index - try to create if not exists 3413 if ( 3414 db_file is None 3415 or db_hdr_file is None 3416 or (not os.path.exists(db_file) and not db_file_is_http) 3417 or not os.path.exists(db_hdr_file) 3418 or not db_file_type in ["bw"] 3419 ): 3420 # if False: 3421 log.error("Annotation failed: database not valid") 3422 log.error(f"Annotation annotation file: {db_file}") 3423 log.error(f"Annotation annotation file type: {db_file_type}") 3424 log.error(f"Annotation annotation header: {db_hdr_file}") 3425 raise ValueError( 3426 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3427 ) 3428 else: 3429 3430 # Log 3431 log.debug( 3432 f"Annotation '{annotation}' - file: " 3433 + str(db_file) 3434 + " and " 3435 + str(db_hdr_file) 3436 ) 3437 3438 # Load header as VCF object 3439 db_hdr_vcf = Variants(input=db_hdr_file) 3440 db_hdr_vcf_header_infos = 
db_hdr_vcf.get_header().infos 3441 log.debug( 3442 "Annotation database header: " 3443 + str(db_hdr_vcf_header_infos) 3444 ) 3445 3446 # For all fields in database 3447 annotation_fields_full = False 3448 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3449 annotation_fields = { 3450 key: key for key in db_hdr_vcf_header_infos 3451 } 3452 log.debug( 3453 "Annotation database header - All annotations added: " 3454 + str(annotation_fields) 3455 ) 3456 annotation_fields_full = True 3457 3458 # Init 3459 cyvcf2_header_rename_dict = {} 3460 cyvcf2_header_list = [] 3461 cyvcf2_header_indexes = {} 3462 3463 # process annotation fields 3464 for annotation_field in annotation_fields: 3465 3466 # New annotation name 3467 annotation_field_new = annotation_fields[annotation_field] 3468 3469 # Check annotation field and index in header 3470 if ( 3471 annotation_field 3472 in db_hdr_vcf.get_header_columns_as_list() 3473 ): 3474 annotation_field_index = ( 3475 db_hdr_vcf.get_header_columns_as_list().index( 3476 annotation_field 3477 ) 3478 - 3 3479 ) 3480 cyvcf2_header_indexes[annotation_field_new] = ( 3481 annotation_field_index 3482 ) 3483 else: 3484 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Append annotation field in cyvcf2 header list 3489 cyvcf2_header_rename_dict[annotation_field_new] = ( 3490 db_hdr_vcf_header_infos[annotation_field].id 3491 ) 3492 cyvcf2_header_list.append( 3493 { 3494 "ID": annotation_field_new, 3495 "Number": db_hdr_vcf_header_infos[ 3496 annotation_field 3497 ].num, 3498 "Type": db_hdr_vcf_header_infos[ 3499 annotation_field 3500 ].type, 3501 "Description": db_hdr_vcf_header_infos[ 3502 annotation_field 3503 ].desc, 3504 } 3505 ) 3506 3507 # Add header on VCF 3508 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3509 annotation_field_new, 3510 db_hdr_vcf_header_infos[annotation_field].num, 3511 
db_hdr_vcf_header_infos[annotation_field].type, 3512 db_hdr_vcf_header_infos[annotation_field].desc, 3513 "HOWARD BigWig annotation", 3514 "unknown", 3515 self.code_type_map[ 3516 db_hdr_vcf_header_infos[annotation_field].type 3517 ], 3518 ) 3519 3520 # Load bigwig database 3521 bw_db = pyBigWig.open(db_file) 3522 if bw_db.isBigWig(): 3523 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3524 else: 3525 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3526 log.error(msg_err) 3527 raise ValueError(msg_err) 3528 3529 annotation_bigwig_config_list.append( 3530 { 3531 "db_file": db_file, 3532 "bw_db": bw_db, 3533 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3534 "cyvcf2_header_list": cyvcf2_header_list, 3535 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3536 } 3537 ) 3538 3539 # Annotate 3540 if annotation_bigwig_config_list: 3541 3542 # Annotation config 3543 log.debug( 3544 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3545 ) 3546 3547 # Export VCF file 3548 self.export_variant_vcf( 3549 vcf_file=tmp_vcf_name, 3550 remove_info=True, 3551 add_samples=False, 3552 index=True, 3553 ) 3554 3555 # Load input tmp file 3556 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3557 3558 # Add header in input file 3559 for annotation_bigwig_config in annotation_bigwig_config_list: 3560 for cyvcf2_header_field in annotation_bigwig_config.get( 3561 "cyvcf2_header_list", [] 3562 ): 3563 log.info( 3564 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3565 ) 3566 input_vcf.add_info_to_header(cyvcf2_header_field) 3567 3568 # Create output VCF file 3569 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3570 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3571 3572 # Fetch variants 3573 log.info(f"Annotations 'bigwig' start...") 3574 
for variant in input_vcf: 3575 3576 for annotation_bigwig_config in annotation_bigwig_config_list: 3577 3578 # DB and indexes 3579 bw_db = annotation_bigwig_config.get("bw_db", None) 3580 cyvcf2_header_indexes = annotation_bigwig_config.get( 3581 "cyvcf2_header_indexes", None 3582 ) 3583 3584 # Retrieve value from chrom pos 3585 res = bw_db.values( 3586 variant.CHROM, variant.POS - 1, variant.POS 3587 ) 3588 3589 # For each annotation fields (and indexes) 3590 for cyvcf2_header_index in cyvcf2_header_indexes: 3591 3592 # If value is NOT nNone 3593 if not np.isnan( 3594 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3595 ): 3596 variant.INFO[cyvcf2_header_index] = res[ 3597 cyvcf2_header_indexes[cyvcf2_header_index] 3598 ] 3599 3600 # Add record in output file 3601 output_vcf.write_record(variant) 3602 3603 # Log 3604 log.debug(f"Annotation done.") 3605 3606 # Close and write file 3607 log.info(f"Annotations 'bigwig' write...") 3608 output_vcf.close() 3609 log.debug(f"Write done.") 3610 3611 # Update variants 3612 log.info(f"Annotations 'bigwig' update...") 3613 self.update_from_vcf(output_vcf_file) 3614 log.debug(f"Update done.") 3615 3616 return True
The function `annotation_bigwig` annotates variants in a VCF file using BigWig databases.
Parameters
- threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the number of threads for parallel processing during the annotation process. If the `threads` parameter is not provided, the method determines the number of threads to use from the system configuration.
Returns
True
3618 def annotation_snpsift(self, threads: int = None) -> None: 3619 """ 3620 This function annotate with bcftools 3621 3622 :param threads: Number of threads to use 3623 :return: the value of the variable "return_value". 3624 """ 3625 3626 # DEBUG 3627 log.debug("Start annotation with bcftools databases") 3628 3629 # Threads 3630 if not threads: 3631 threads = self.get_threads() 3632 log.debug("Threads: " + str(threads)) 3633 3634 # Config 3635 config = self.get_config() 3636 log.debug("Config: " + str(config)) 3637 3638 # Config - snpSift 3639 snpsift_bin_command = get_bin_command( 3640 bin="SnpSift.jar", 3641 tool="snpsift", 3642 bin_type="jar", 3643 config=config, 3644 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3645 ) 3646 if not snpsift_bin_command: 3647 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3648 log.error(msg_err) 3649 raise ValueError(msg_err) 3650 3651 # Config - bcftools 3652 bcftools_bin_command = get_bin_command( 3653 bin="bcftools", 3654 tool="bcftools", 3655 bin_type="bin", 3656 config=config, 3657 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3658 ) 3659 if not bcftools_bin_command: 3660 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3661 log.error(msg_err) 3662 raise ValueError(msg_err) 3663 3664 # Config - BCFTools databases folders 3665 databases_folders = set( 3666 self.get_config() 3667 .get("folders", {}) 3668 .get("databases", {}) 3669 .get("annotations", ["."]) 3670 + self.get_config() 3671 .get("folders", {}) 3672 .get("databases", {}) 3673 .get("bcftools", ["."]) 3674 ) 3675 log.debug("Databases annotations: " + str(databases_folders)) 3676 3677 # Param 3678 annotations = ( 3679 self.get_param() 3680 .get("annotation", {}) 3681 .get("snpsift", {}) 3682 .get("annotations", None) 3683 ) 3684 log.debug("Annotations: " + str(annotations)) 3685 3686 # Assembly 3687 assembly = self.get_param().get( 3688 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3689 ) 3690 
3691 # Data 3692 table_variants = self.get_table_variants() 3693 3694 # Check if not empty 3695 log.debug("Check if not empty") 3696 sql_query_chromosomes = ( 3697 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3698 ) 3699 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3700 if not sql_query_chromosomes_df["count"][0]: 3701 log.info(f"VCF empty") 3702 return 3703 3704 # VCF header 3705 vcf_reader = self.get_header() 3706 log.debug("Initial header: " + str(vcf_reader.infos)) 3707 3708 # Existing annotations 3709 for vcf_annotation in self.get_header().infos: 3710 3711 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3712 log.debug( 3713 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3714 ) 3715 3716 if annotations: 3717 3718 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3719 3720 # Export VCF file 3721 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3722 3723 # Init 3724 commands = {} 3725 3726 for annotation in annotations: 3727 annotation_fields = annotations[annotation] 3728 3729 # Annotation Name 3730 annotation_name = os.path.basename(annotation) 3731 3732 if not annotation_fields: 3733 annotation_fields = {"INFO": None} 3734 3735 log.debug(f"Annotation '{annotation_name}'") 3736 log.debug( 3737 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3738 ) 3739 3740 # Create Database 3741 database = Database( 3742 database=annotation, 3743 databases_folders=databases_folders, 3744 assembly=assembly, 3745 ) 3746 3747 # Find files 3748 db_file = database.get_database() 3749 db_file = full_path(db_file) 3750 db_hdr_file = database.get_header_file() 3751 db_hdr_file = full_path(db_hdr_file) 3752 db_file_type = database.get_format() 3753 db_tbi_file = f"{db_file}.tbi" 3754 db_file_compressed = database.is_compressed() 3755 3756 # Check if compressed 3757 if not db_file_compressed: 3758 log.error( 3759 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3760 ) 3761 raise ValueError( 3762 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3763 ) 3764 3765 # Check if indexed 3766 if not os.path.exists(db_tbi_file): 3767 log.error( 3768 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3769 ) 3770 raise ValueError( 3771 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 3774 # Check index - try to create if not exists 3775 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3776 log.error("Annotation failed: database not valid") 3777 log.error(f"Annotation annotation file: {db_file}") 3778 log.error(f"Annotation annotation header: {db_hdr_file}") 3779 log.error(f"Annotation annotation index: {db_tbi_file}") 3780 raise ValueError( 3781 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3782 ) 3783 else: 3784 3785 log.debug( 3786 f"Annotation '{annotation}' - file: " 3787 + str(db_file) 3788 + " and " 3789 + str(db_hdr_file) 3790 ) 3791 3792 # Load header as VCF object 3793 db_hdr_vcf = Variants(input=db_hdr_file) 3794 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3795 log.debug( 3796 "Annotation database header: " 3797 + str(db_hdr_vcf_header_infos) 3798 ) 3799 3800 # For all fields in database 3801 annotation_fields_full = False 3802 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3803 annotation_fields = { 3804 key: key for key in db_hdr_vcf_header_infos 3805 } 3806 log.debug( 3807 "Annotation database header - All annotations added: " 3808 + str(annotation_fields) 3809 ) 3810 annotation_fields_full = True 3811 3812 # # Create file for field rename 3813 # log.debug("Create file for field rename") 3814 # tmp_rename = NamedTemporaryFile( 3815 # prefix=self.get_prefix(), 3816 # dir=self.get_tmp_dir(), 3817 # suffix=".rename", 3818 # delete=False, 3819 # ) 3820 # tmp_rename_name = tmp_rename.name 
3821 # tmp_files.append(tmp_rename_name) 3822 3823 # Number of fields 3824 nb_annotation_field = 0 3825 annotation_list = [] 3826 annotation_infos_rename_list = [] 3827 3828 for annotation_field in annotation_fields: 3829 3830 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3831 annotation_fields_new_name = annotation_fields.get( 3832 annotation_field, annotation_field 3833 ) 3834 if not annotation_fields_new_name: 3835 annotation_fields_new_name = annotation_field 3836 3837 # Check if field is in DB and if field is not elready in input data 3838 if ( 3839 annotation_field in db_hdr_vcf.get_header().infos 3840 and annotation_fields_new_name 3841 not in self.get_header().infos 3842 ): 3843 3844 log.info( 3845 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3846 ) 3847 3848 # BCFTools annotate param to rename fields 3849 if annotation_field != annotation_fields_new_name: 3850 annotation_infos_rename_list.append( 3851 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3852 ) 3853 3854 # Add INFO field to header 3855 db_hdr_vcf_header_infos_number = ( 3856 db_hdr_vcf_header_infos[annotation_field].num or "." 
3857 ) 3858 db_hdr_vcf_header_infos_type = ( 3859 db_hdr_vcf_header_infos[annotation_field].type 3860 or "String" 3861 ) 3862 db_hdr_vcf_header_infos_description = ( 3863 db_hdr_vcf_header_infos[annotation_field].desc 3864 or f"{annotation_field} description" 3865 ) 3866 db_hdr_vcf_header_infos_source = ( 3867 db_hdr_vcf_header_infos[annotation_field].source 3868 or "unknown" 3869 ) 3870 db_hdr_vcf_header_infos_version = ( 3871 db_hdr_vcf_header_infos[annotation_field].version 3872 or "unknown" 3873 ) 3874 3875 vcf_reader.infos[annotation_fields_new_name] = ( 3876 vcf.parser._Info( 3877 annotation_fields_new_name, 3878 db_hdr_vcf_header_infos_number, 3879 db_hdr_vcf_header_infos_type, 3880 db_hdr_vcf_header_infos_description, 3881 db_hdr_vcf_header_infos_source, 3882 db_hdr_vcf_header_infos_version, 3883 self.code_type_map[ 3884 db_hdr_vcf_header_infos_type 3885 ], 3886 ) 3887 ) 3888 3889 annotation_list.append(annotation_field) 3890 3891 nb_annotation_field += 1 3892 3893 else: 3894 3895 if ( 3896 annotation_field 3897 not in db_hdr_vcf.get_header().infos 3898 ): 3899 log.warning( 3900 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3901 ) 3902 if ( 3903 annotation_fields_new_name 3904 in self.get_header().infos 3905 ): 3906 log.warning( 3907 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3908 ) 3909 3910 log.info( 3911 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3912 ) 3913 3914 annotation_infos = ",".join(annotation_list) 3915 3916 if annotation_infos != "": 3917 3918 # Annotated VCF (and error file) 3919 tmp_annotation_vcf_name = os.path.join( 3920 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3921 ) 3922 tmp_annotation_vcf_name_err = ( 3923 tmp_annotation_vcf_name + ".err" 3924 ) 3925 3926 # Add fields to annotate 3927 if not annotation_fields_full: 3928 annotation_infos_option = f"-info {annotation_infos}" 3929 else: 
3930 annotation_infos_option = "" 3931 3932 # Info fields rename 3933 if annotation_infos_rename_list: 3934 annotation_infos_rename = " -c " + ",".join( 3935 annotation_infos_rename_list 3936 ) 3937 else: 3938 annotation_infos_rename = "" 3939 3940 # Annotate command 3941 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3942 3943 # Add command 3944 commands[command_annotate] = tmp_annotation_vcf_name 3945 3946 if commands: 3947 3948 # Export VCF file 3949 self.export_variant_vcf( 3950 vcf_file=tmp_vcf_name, 3951 remove_info=True, 3952 add_samples=False, 3953 index=True, 3954 ) 3955 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3956 3957 # Num command 3958 nb_command = 0 3959 3960 # Annotate 3961 for command_annotate in commands: 3962 nb_command += 1 3963 log.info( 3964 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3965 ) 3966 log.debug(f"command_annotate={command_annotate}") 3967 run_parallel_commands([command_annotate], threads) 3968 3969 # Debug 3970 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3971 3972 # Update variants 3973 log.info( 3974 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3975 ) 3976 self.update_from_vcf(commands[command_annotate])
This function annotates variants using SnpSift, piped through bcftools to rename INFO fields.
Parameters
- threads: Number of threads to use
Returns
None
3978 def annotation_bcftools(self, threads: int = None) -> None: 3979 """ 3980 This function annotate with bcftools 3981 3982 :param threads: Number of threads to use 3983 :return: the value of the variable "return_value". 3984 """ 3985 3986 # DEBUG 3987 log.debug("Start annotation with bcftools databases") 3988 3989 # Threads 3990 if not threads: 3991 threads = self.get_threads() 3992 log.debug("Threads: " + str(threads)) 3993 3994 # Config 3995 config = self.get_config() 3996 log.debug("Config: " + str(config)) 3997 3998 # DEBUG 3999 delete_tmp = True 4000 if self.get_config().get("verbosity", "warning") in ["debug"]: 4001 delete_tmp = False 4002 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4003 4004 # Config - BCFTools bin command 4005 bcftools_bin_command = get_bin_command( 4006 bin="bcftools", 4007 tool="bcftools", 4008 bin_type="bin", 4009 config=config, 4010 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4011 ) 4012 if not bcftools_bin_command: 4013 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4014 log.error(msg_err) 4015 raise ValueError(msg_err) 4016 4017 # Config - BCFTools databases folders 4018 databases_folders = set( 4019 self.get_config() 4020 .get("folders", {}) 4021 .get("databases", {}) 4022 .get("annotations", ["."]) 4023 + self.get_config() 4024 .get("folders", {}) 4025 .get("databases", {}) 4026 .get("bcftools", ["."]) 4027 ) 4028 log.debug("Databases annotations: " + str(databases_folders)) 4029 4030 # Param 4031 annotations = ( 4032 self.get_param() 4033 .get("annotation", {}) 4034 .get("bcftools", {}) 4035 .get("annotations", None) 4036 ) 4037 log.debug("Annotations: " + str(annotations)) 4038 4039 # Assembly 4040 assembly = self.get_param().get( 4041 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4042 ) 4043 4044 # Data 4045 table_variants = self.get_table_variants() 4046 4047 # Check if not empty 4048 log.debug("Check if not empty") 4049 sql_query_chromosomes = ( 4050 f"""SELECT 
count(*) as count FROM {table_variants} as table_variants""" 4051 ) 4052 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4053 if not sql_query_chromosomes_df["count"][0]: 4054 log.info(f"VCF empty") 4055 return 4056 4057 # Export in VCF 4058 log.debug("Create initial file to annotate") 4059 tmp_vcf = NamedTemporaryFile( 4060 prefix=self.get_prefix(), 4061 dir=self.get_tmp_dir(), 4062 suffix=".vcf.gz", 4063 delete=False, 4064 ) 4065 tmp_vcf_name = tmp_vcf.name 4066 4067 # VCF header 4068 vcf_reader = self.get_header() 4069 log.debug("Initial header: " + str(vcf_reader.infos)) 4070 4071 # Existing annotations 4072 for vcf_annotation in self.get_header().infos: 4073 4074 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4075 log.debug( 4076 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4077 ) 4078 4079 if annotations: 4080 4081 tmp_ann_vcf_list = [] 4082 commands = [] 4083 tmp_files = [] 4084 err_files = [] 4085 4086 for annotation in annotations: 4087 annotation_fields = annotations[annotation] 4088 4089 # Annotation Name 4090 annotation_name = os.path.basename(annotation) 4091 4092 if not annotation_fields: 4093 annotation_fields = {"INFO": None} 4094 4095 log.debug(f"Annotation '{annotation_name}'") 4096 log.debug( 4097 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4098 ) 4099 4100 # Create Database 4101 database = Database( 4102 database=annotation, 4103 databases_folders=databases_folders, 4104 assembly=assembly, 4105 ) 4106 4107 # Find files 4108 db_file = database.get_database() 4109 db_file = full_path(db_file) 4110 db_hdr_file = database.get_header_file() 4111 db_hdr_file = full_path(db_hdr_file) 4112 db_file_type = database.get_format() 4113 db_tbi_file = f"{db_file}.tbi" 4114 db_file_compressed = database.is_compressed() 4115 4116 # Check if compressed 4117 if not db_file_compressed: 4118 log.error( 4119 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4120 ) 
4121 raise ValueError( 4122 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4123 ) 4124 4125 # Check if indexed 4126 if not os.path.exists(db_tbi_file): 4127 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4128 raise ValueError( 4129 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4130 ) 4131 4132 # Check index - try to create if not exists 4133 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4134 log.error("Annotation failed: database not valid") 4135 log.error(f"Annotation annotation file: {db_file}") 4136 log.error(f"Annotation annotation header: {db_hdr_file}") 4137 log.error(f"Annotation annotation index: {db_tbi_file}") 4138 raise ValueError( 4139 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4140 ) 4141 else: 4142 4143 log.debug( 4144 f"Annotation '{annotation}' - file: " 4145 + str(db_file) 4146 + " and " 4147 + str(db_hdr_file) 4148 ) 4149 4150 # Load header as VCF object 4151 db_hdr_vcf = Variants(input=db_hdr_file) 4152 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4153 log.debug( 4154 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4155 ) 4156 4157 # For all fields in database 4158 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4159 annotation_fields = { 4160 key: key for key in db_hdr_vcf_header_infos 4161 } 4162 log.debug( 4163 "Annotation database header - All annotations added: " 4164 + str(annotation_fields) 4165 ) 4166 4167 # Number of fields 4168 nb_annotation_field = 0 4169 annotation_list = [] 4170 4171 for annotation_field in annotation_fields: 4172 4173 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 4174 annotation_fields_new_name = annotation_fields.get( 4175 annotation_field, annotation_field 4176 ) 4177 if not annotation_fields_new_name: 4178 annotation_fields_new_name = annotation_field 4179 4180 # Check if field is in DB and if field is not elready in input data 4181 if ( 4182 annotation_field in db_hdr_vcf.get_header().infos 4183 and annotation_fields_new_name 4184 not in self.get_header().infos 4185 ): 4186 4187 log.info( 4188 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4189 ) 4190 4191 # Add INFO field to header 4192 db_hdr_vcf_header_infos_number = ( 4193 db_hdr_vcf_header_infos[annotation_field].num or "." 4194 ) 4195 db_hdr_vcf_header_infos_type = ( 4196 db_hdr_vcf_header_infos[annotation_field].type 4197 or "String" 4198 ) 4199 db_hdr_vcf_header_infos_description = ( 4200 db_hdr_vcf_header_infos[annotation_field].desc 4201 or f"{annotation_field} description" 4202 ) 4203 db_hdr_vcf_header_infos_source = ( 4204 db_hdr_vcf_header_infos[annotation_field].source 4205 or "unknown" 4206 ) 4207 db_hdr_vcf_header_infos_version = ( 4208 db_hdr_vcf_header_infos[annotation_field].version 4209 or "unknown" 4210 ) 4211 4212 vcf_reader.infos[annotation_fields_new_name] = ( 4213 vcf.parser._Info( 4214 annotation_fields_new_name, 4215 db_hdr_vcf_header_infos_number, 4216 db_hdr_vcf_header_infos_type, 4217 db_hdr_vcf_header_infos_description, 4218 db_hdr_vcf_header_infos_source, 4219 db_hdr_vcf_header_infos_version, 4220 self.code_type_map[db_hdr_vcf_header_infos_type], 4221 ) 4222 ) 4223 4224 # annotation_list.append(annotation_field) 4225 if annotation_field != annotation_fields_new_name: 4226 annotation_list.append( 4227 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4228 ) 4229 else: 4230 annotation_list.append(annotation_field) 4231 4232 nb_annotation_field += 1 4233 4234 else: 4235 4236 if annotation_field not in db_hdr_vcf.get_header().infos: 4237 log.warning( 4238 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4239 ) 4240 if annotation_fields_new_name in self.get_header().infos: 4241 log.warning( 4242 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4243 ) 4244 4245 log.info( 4246 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4247 ) 4248 4249 annotation_infos = ",".join(annotation_list) 4250 4251 if annotation_infos != "": 4252 4253 # Protect header for bcftools (remove "#CHROM" and variants line) 4254 log.debug("Protect Header file - remove #CHROM line if exists") 4255 tmp_header_vcf = NamedTemporaryFile( 4256 prefix=self.get_prefix(), 4257 dir=self.get_tmp_dir(), 4258 suffix=".hdr", 4259 delete=False, 4260 ) 4261 tmp_header_vcf_name = tmp_header_vcf.name 4262 tmp_files.append(tmp_header_vcf_name) 4263 # Command 4264 if db_hdr_file.endswith(".gz"): 4265 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4266 else: 4267 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4268 # Run 4269 run_parallel_commands([command_extract_header], 1) 4270 4271 # Find chomosomes 4272 log.debug("Find chromosomes ") 4273 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4274 sql_query_chromosomes_df = self.get_query_to_df( 4275 sql_query_chromosomes 4276 ) 4277 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4278 4279 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4280 4281 # BED columns in the annotation file 4282 if db_file_type in ["bed"]: 4283 annotation_infos = "CHROM,POS,POS," + annotation_infos 4284 4285 for chrom in chomosomes_list: 4286 4287 # Create BED on initial VCF 4288 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4289 tmp_bed = NamedTemporaryFile( 4290 prefix=self.get_prefix(), 4291 
dir=self.get_tmp_dir(), 4292 suffix=".bed", 4293 delete=False, 4294 ) 4295 tmp_bed_name = tmp_bed.name 4296 tmp_files.append(tmp_bed_name) 4297 4298 # Detecte regions 4299 log.debug( 4300 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4301 ) 4302 window = 1000000 4303 sql_query_intervals_for_bed = f""" 4304 SELECT \"#CHROM\", 4305 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4306 \"POS\"+{window} 4307 FROM {table_variants} as table_variants 4308 WHERE table_variants.\"#CHROM\" = '{chrom}' 4309 """ 4310 regions = self.conn.execute( 4311 sql_query_intervals_for_bed 4312 ).fetchall() 4313 merged_regions = merge_regions(regions) 4314 log.debug( 4315 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 4316 ) 4317 4318 header = ["#CHROM", "START", "END"] 4319 with open(tmp_bed_name, "w") as f: 4320 # Write the header with tab delimiter 4321 f.write("\t".join(header) + "\n") 4322 for d in merged_regions: 4323 # Write each data row with tab delimiter 4324 f.write("\t".join(map(str, d)) + "\n") 4325 4326 # Tmp files 4327 tmp_annotation_vcf = NamedTemporaryFile( 4328 prefix=self.get_prefix(), 4329 dir=self.get_tmp_dir(), 4330 suffix=".vcf.gz", 4331 delete=False, 4332 ) 4333 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4334 tmp_files.append(tmp_annotation_vcf_name) 4335 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4336 tmp_annotation_vcf_name_err = ( 4337 tmp_annotation_vcf_name + ".err" 4338 ) 4339 err_files.append(tmp_annotation_vcf_name_err) 4340 4341 # Annotate Command 4342 log.debug( 4343 f"Annotation '{annotation}' - add bcftools command" 4344 ) 4345 4346 # Command 4347 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 4348 4349 # Add command 4350 commands.append(command_annotate) 4351 4352 # if some commands 4353 if commands: 4354 4355 # Export VCF file 4356 self.export_variant_vcf( 4357 vcf_file=tmp_vcf_name, 4358 remove_info=True, 4359 add_samples=False, 4360 index=True, 4361 ) 4362 4363 # Threads 4364 # calculate threads for annotated commands 4365 if commands: 4366 threads_bcftools_annotate = round(threads / len(commands)) 4367 else: 4368 threads_bcftools_annotate = 1 4369 4370 if not threads_bcftools_annotate: 4371 threads_bcftools_annotate = 1 4372 4373 # Add threads option to bcftools commands 4374 if threads_bcftools_annotate > 1: 4375 commands_threaded = [] 4376 for command in commands: 4377 commands_threaded.append( 4378 command.replace( 4379 f"{bcftools_bin_command} annotate ", 4380 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4381 ) 4382 ) 4383 commands = commands_threaded 4384 4385 # Command annotation multithreading 4386 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4387 log.info( 4388 f"Annotation - Annotation multithreaded in " 4389 + str(len(commands)) 4390 + " commands" 4391 ) 4392 4393 run_parallel_commands(commands, threads) 4394 4395 # Merge 4396 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4397 4398 if tmp_ann_vcf_list_cmd: 4399 4400 # Tmp file 4401 tmp_annotate_vcf = NamedTemporaryFile( 4402 prefix=self.get_prefix(), 4403 dir=self.get_tmp_dir(), 4404 suffix=".vcf.gz", 4405 delete=True, 4406 ) 4407 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4408 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4409 err_files.append(tmp_annotate_vcf_name_err) 4410 4411 # Tmp file remove command 4412 tmp_files_remove_command = "" 4413 if tmp_files: 4414 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4415 4416 # Command merge 4417 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4418 log.info( 4419 f"Annotation - Annotation merging " 4420 + str(len(commands)) 4421 + " annotated files" 4422 ) 4423 log.debug(f"Annotation - merge command: {merge_command}") 4424 run_parallel_commands([merge_command], 1) 4425 4426 # Error messages 4427 log.info(f"Error/Warning messages:") 4428 error_message_command_all = [] 4429 error_message_command_warning = [] 4430 error_message_command_err = [] 4431 for err_file in err_files: 4432 with open(err_file, "r") as f: 4433 for line in f: 4434 message = line.strip() 4435 error_message_command_all.append(message) 4436 if line.startswith("[W::"): 4437 error_message_command_warning.append(message) 4438 if line.startswith("[E::"): 4439 error_message_command_err.append( 4440 f"{err_file}: " + message 4441 ) 4442 # log info 4443 for message in list( 4444 set(error_message_command_err + error_message_command_warning) 4445 ): 4446 log.info(f" {message}") 4447 # debug info 4448 for message in list(set(error_message_command_all)): 4449 log.debug(f" {message}") 4450 # failed 4451 if len(error_message_command_err): 4452 log.error("Annotation failed: Error in commands") 4453 raise ValueError("Annotation failed: Error in commands") 4454 4455 # Update variants 4456 log.info(f"Annotation - Updating...") 4457 self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates variants using bcftools.
Parameters
- threads: Number of threads to use
Returns
None
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        This function annotates with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: False when the variants table is empty or the VCF has no samples,
            True once annotation completed (annotation is currently always forced).
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads (fall back to the instance-level configured thread count)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # NOTE(review): a missing databases folder is only logged as an error here;
        # execution continues and relies on databases_download_exomiser() below.
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser jar command (fail fast if the binary is not available)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, falling back to the default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate in an empty table)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples (Exomiser needs at least one sample/genotype)
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): these java options are built and logged but never injected
        # into exomiser_command_analysis below — confirm whether get_bin_command()
        # already applies them, otherwise they are dead configuration.
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (no-op if already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always True: existing Exomiser annotations are redone)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            # All intermediate files (configs, initial VCF, results) live in a
            # temporary directory removed at the end of the block
            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json,
                        # yaml.safe_load parses both)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject does not exist -> find a sample ID
                    if not param_exomiser_subject:

                        # Find sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample of the VCF)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures does not exist -> try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Find HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep digits only, e.g. "HP:0001156" -> "0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures does not exist -> remove hiPhivePrioritiser step
                    # (this prioritiser requires phenotype input)
                    # NOTE(review): removing from the list being iterated — works here
                    # because iteration stops mattering after the single matching step,
                    # but confirm if multiple hiPhivePrioritiser steps can occur.
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict (ISO-8601-like creation timestamp)
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param);
                    # TSV_VARIANT and VCF are always required for the result parsing below
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict
                # NOTE(review): .copy() is shallow; the split dict shares nested
                # sub-dicts with param_exomiser_analysis_dict, only the top-level
                # "phenopacket" key is popped below.
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used afterwards
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only when the ClinVar whitelist exists)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param -> split analysis/sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample) -> single full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status aborts the annotation)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Flag to explode TSV columns as INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0: header/types only, no rows)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (variant coordinates, already in the table)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type
                            # NOTE(review): suspicious mapping — any non-object dtype
                            # (including float64) is declared "Integer", and since the
                            # LIMIT 0 frame has no rows, .all() on an empty series is
                            # vacuously True so object columns always become "Float".
                            # Confirm intended VCF type inference.
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info (sanitize field name: "-" -> "_", drop "#")
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                            """
                            )

                    # Update query: append the concatenated Exomiser fields to INFO,
                    # joining the TSV on chromosome/position/ref/alt
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                    """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionnary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) Default : None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotipic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomise database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
5237 def annotation_snpeff(self, threads: int = None) -> None: 5238 """ 5239 This function annotate with snpEff 5240 5241 :param threads: The number of threads to use 5242 :return: the value of the variable "return_value". 5243 """ 5244 5245 # DEBUG 5246 log.debug("Start annotation with snpeff databases") 5247 5248 # Threads 5249 if not threads: 5250 threads = self.get_threads() 5251 log.debug("Threads: " + str(threads)) 5252 5253 # DEBUG 5254 delete_tmp = True 5255 if self.get_config().get("verbosity", "warning") in ["debug"]: 5256 delete_tmp = False 5257 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5258 5259 # Config 5260 config = self.get_config() 5261 log.debug("Config: " + str(config)) 5262 5263 # Config - Folders - Databases 5264 databases_folders = ( 5265 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5266 ) 5267 log.debug("Databases annotations: " + str(databases_folders)) 5268 5269 # Config - snpEff bin command 5270 snpeff_bin_command = get_bin_command( 5271 bin="snpEff.jar", 5272 tool="snpeff", 5273 bin_type="jar", 5274 config=config, 5275 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5276 ) 5277 if not snpeff_bin_command: 5278 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5279 log.error(msg_err) 5280 raise ValueError(msg_err) 5281 5282 # Config - snpEff databases 5283 snpeff_databases = ( 5284 config.get("folders", {}) 5285 .get("databases", {}) 5286 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5287 ) 5288 snpeff_databases = full_path(snpeff_databases) 5289 if snpeff_databases is not None and snpeff_databases != "": 5290 log.debug(f"Create snpEff databases folder") 5291 if not os.path.exists(snpeff_databases): 5292 os.makedirs(snpeff_databases) 5293 5294 # Param 5295 param = self.get_param() 5296 log.debug("Param: " + str(param)) 5297 5298 # Param 5299 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5300 log.debug("Options: " + str(options)) 5301 5302 # Param - Assembly 
5303 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5304 5305 # Param - Options 5306 snpeff_options = ( 5307 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5308 ) 5309 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5310 snpeff_csvstats = ( 5311 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5312 ) 5313 if snpeff_stats: 5314 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5315 snpeff_stats = full_path(snpeff_stats) 5316 snpeff_options += f" -stats {snpeff_stats}" 5317 if snpeff_csvstats: 5318 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5319 snpeff_csvstats = full_path(snpeff_csvstats) 5320 snpeff_options += f" -csvStats {snpeff_csvstats}" 5321 5322 # Data 5323 table_variants = self.get_table_variants() 5324 5325 # Check if not empty 5326 log.debug("Check if not empty") 5327 sql_query_chromosomes = ( 5328 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5329 ) 5330 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5331 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5332 log.info(f"VCF empty") 5333 return 5334 5335 # Export in VCF 5336 log.debug("Create initial file to annotate") 5337 tmp_vcf = NamedTemporaryFile( 5338 prefix=self.get_prefix(), 5339 dir=self.get_tmp_dir(), 5340 suffix=".vcf.gz", 5341 delete=True, 5342 ) 5343 tmp_vcf_name = tmp_vcf.name 5344 5345 # VCF header 5346 vcf_reader = self.get_header() 5347 log.debug("Initial header: " + str(vcf_reader.infos)) 5348 5349 # Existing annotations 5350 for vcf_annotation in self.get_header().infos: 5351 5352 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5353 log.debug( 5354 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5355 ) 5356 5357 # Memory limit 5358 # if config.get("memory", None): 5359 # memory_limit = config.get("memory", "8G") 5360 # else: 5361 # 
memory_limit = "8G" 5362 memory_limit = self.get_memory("8G") 5363 log.debug(f"memory_limit: {memory_limit}") 5364 5365 # snpEff java options 5366 snpeff_java_options = ( 5367 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5368 ) 5369 log.debug(f"Exomiser java options: {snpeff_java_options}") 5370 5371 force_update_annotation = True 5372 5373 if "ANN" not in self.get_header().infos or force_update_annotation: 5374 5375 # Check snpEff database 5376 log.debug(f"Check snpEff databases {[assembly]}") 5377 databases_download_snpeff( 5378 folder=snpeff_databases, assemblies=[assembly], config=config 5379 ) 5380 5381 # Export VCF file 5382 self.export_variant_vcf( 5383 vcf_file=tmp_vcf_name, 5384 remove_info=True, 5385 add_samples=False, 5386 index=True, 5387 ) 5388 5389 # Tmp file 5390 err_files = [] 5391 tmp_annotate_vcf = NamedTemporaryFile( 5392 prefix=self.get_prefix(), 5393 dir=self.get_tmp_dir(), 5394 suffix=".vcf", 5395 delete=False, 5396 ) 5397 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5398 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5399 err_files.append(tmp_annotate_vcf_name_err) 5400 5401 # Command 5402 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5403 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5404 run_parallel_commands([snpeff_command], 1) 5405 5406 # Error messages 5407 log.info(f"Error/Warning messages:") 5408 error_message_command_all = [] 5409 error_message_command_warning = [] 5410 error_message_command_err = [] 5411 for err_file in err_files: 5412 with open(err_file, "r") as f: 5413 for line in f: 5414 message = line.strip() 5415 error_message_command_all.append(message) 5416 if line.startswith("[W::"): 5417 error_message_command_warning.append(message) 5418 if line.startswith("[E::"): 5419 error_message_command_err.append(f"{err_file}: " + message) 5420 # log info 
5421 for message in list( 5422 set(error_message_command_err + error_message_command_warning) 5423 ): 5424 log.info(f" {message}") 5425 # debug info 5426 for message in list(set(error_message_command_all)): 5427 log.debug(f" {message}") 5428 # failed 5429 if len(error_message_command_err): 5430 log.error("Annotation failed: Error in commands") 5431 raise ValueError("Annotation failed: Error in commands") 5432 5433 # Find annotation in header 5434 with open(tmp_annotate_vcf_name, "rt") as f: 5435 header_list = self.read_vcf_header(f) 5436 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5437 5438 for ann in annovar_vcf_header.infos: 5439 if ann not in self.get_header().infos: 5440 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5441 5442 # Update variants 5443 log.info(f"Annotation - Updating...") 5444 self.update_from_vcf(tmp_annotate_vcf_name) 5445 5446 else: 5447 if "ANN" in self.get_header().infos: 5448 log.debug(f"Existing snpEff annotations in VCF") 5449 if force_update_annotation: 5450 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates with snpEff
Parameters
- threads: The number of threads to use
Returns
the value of the variable "return_value".
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the current variants to a temporary VCF, runs Annovar
        (table_annovar.pl) once per requested database, post-processes each
        output through a bcftools/sed/awk pipe, merges the annotated files
        with bcftools merge, and updates the variants table from the merged
        VCF. Temporary files are removed at the end.

        :param threads: number of threads to use (defaults to self.get_threads())
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed here but the cleanup section at the
        # bottom runs under "if True:" regardless — confirm whether tmp files
        # should really be kept in debug mode.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            # A list of folders is allowed in config: only the first one is used
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even if the field already exists in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never used in this method — candidate
            # for removal.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: parse the stderr capture files produced above
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and copy any new
                # INFO definitions into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.

Parameters:
- threads: the number of threads to use.

Returns:
- the value of the variable "return_value".
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with Parquet (or attached) annotation
        databases.

        For each configured database, loads its header, maps the requested
        annotation fields to database columns, builds per-chromosome DuckDB
        UPDATE queries that append "FIELD=value" pairs to the INFO column,
        and executes them. Supports "variants" databases (joined on
        CHROM/POS/REF/ALT) and "regions" databases (joined on position
        intervals, with values aggregated per POS).

        :param threads: number of threads to use for the annotation
                        (defaults to self.get_threads())
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used later in this
        # method — confirm whether it is still needed.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: overwrite existing INFO fields
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Force Append Annotation: only fill fields that are empty/"."
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): added_columns is never appended to in this method, so
        # the drop loop at the end is currently a no-op — confirm intent.
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each
            # of them as a full-INFO annotation
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Register the extra column with a generic
                                # String INFO definition
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header, falling back to
                            # generic values when the database header is
                            # incomplete
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate where the field is empty/"."
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                    ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                    ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the whole INFO column instead of
                        # per-field CASE expressions
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Field-removal queries (update option) run first
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: interval
                            # overlap join, aggregated per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                    """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    """

                            # Annotation with variants database: exact match
                            # on CHROM/POS/ALT/REF
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                    """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    """

                            # Create update query
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            AND (
                                                concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can nest deeply)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet file.
Parameters
- threads: number of threads to use for the annotation
Returns
the annotation result.
6427 def annotation_splice(self, threads: int = None) -> None: 6428 """ 6429 This function annotate with snpEff 6430 6431 :param threads: The number of threads to use 6432 :return: the value of the variable "return_value". 6433 """ 6434 6435 # DEBUG 6436 log.debug("Start annotation with splice tools") 6437 6438 # Threads 6439 if not threads: 6440 threads = self.get_threads() 6441 log.debug("Threads: " + str(threads)) 6442 6443 # DEBUG 6444 delete_tmp = True 6445 if self.get_config().get("verbosity", "warning") in ["debug"]: 6446 delete_tmp = False 6447 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6448 6449 # Config 6450 config = self.get_config() 6451 log.debug("Config: " + str(config)) 6452 splice_config = config.get("tools", {}).get("splice", {}) 6453 if not splice_config: 6454 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6455 msg_err = "No Splice tool config" 6456 raise ValueError(msg_err) 6457 log.debug(f"splice_config: {splice_config}") 6458 6459 # Config - Folders - Databases 6460 databases_folders = ( 6461 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6462 ) 6463 log.debug("Databases annotations: " + str(databases_folders)) 6464 6465 # Splice docker image 6466 splice_docker_image = splice_config.get("docker").get("image") 6467 6468 # Pull splice image if it's not already there 6469 if not check_docker_image_exists(splice_docker_image): 6470 log.warning( 6471 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6472 ) 6473 try: 6474 command(f"docker pull {splice_config.get('docker').get('image')}") 6475 except subprocess.CalledProcessError: 6476 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6477 log.error(msg_err) 6478 raise ValueError(msg_err) 6479 6480 # Config - splice databases 6481 splice_databases = ( 6482 config.get("folders", {}) 6483 .get("databases", {}) 6484 .get("splice", DEFAULT_SPLICE_FOLDER) 6485 ) 6486 splice_databases = 
full_path(splice_databases) 6487 6488 # Param 6489 param = self.get_param() 6490 log.debug("Param: " + str(param)) 6491 6492 # Param 6493 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6494 log.debug("Options: " + str(options)) 6495 6496 # Data 6497 table_variants = self.get_table_variants() 6498 6499 # Check if not empty 6500 log.debug("Check if not empty") 6501 sql_query_chromosomes = ( 6502 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6503 ) 6504 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6505 log.info("VCF empty") 6506 return None 6507 6508 # Export in VCF 6509 log.debug("Create initial file to annotate") 6510 6511 # Create output folder / work folder 6512 if options.get("output_folder", ""): 6513 output_folder = options.get("output_folder", "") 6514 if not os.path.exists(output_folder): 6515 Path(output_folder).mkdir(parents=True, exist_ok=True) 6516 else: 6517 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6518 if not os.path.exists(output_folder): 6519 Path(output_folder).mkdir(parents=True, exist_ok=True) 6520 6521 if options.get("workdir", ""): 6522 workdir = options.get("workdir", "") 6523 else: 6524 workdir = "/work" 6525 6526 # Create tmp VCF file 6527 tmp_vcf = NamedTemporaryFile( 6528 prefix=self.get_prefix(), 6529 dir=output_folder, 6530 suffix=".vcf", 6531 delete=False, 6532 ) 6533 tmp_vcf_name = tmp_vcf.name 6534 6535 # VCF header 6536 header = self.get_header() 6537 6538 # Existing annotations 6539 for vcf_annotation in self.get_header().infos: 6540 6541 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6542 log.debug( 6543 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6544 ) 6545 6546 # Memory limit 6547 if config.get("memory", None): 6548 memory_limit = config.get("memory", "8G").upper() 6549 # upper() 6550 else: 6551 memory_limit = "8G" 6552 log.debug(f"memory_limit: {memory_limit}") 6553 6554 # 
Check number of variants to annotate 6555 where_clause_regex_spliceai = r"SpliceAI_\w+" 6556 where_clause_regex_spip = r"SPiP_\w+" 6557 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6558 df_list_of_variants_to_annotate = self.get_query_to_df( 6559 query=f""" SELECT * FROM variants {where_clause} """ 6560 ) 6561 if len(df_list_of_variants_to_annotate) == 0: 6562 log.warning( 6563 f"No variants to annotate with splice. Variants probably already annotated with splice" 6564 ) 6565 return None 6566 else: 6567 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6568 6569 # Export VCF file 6570 self.export_variant_vcf( 6571 vcf_file=tmp_vcf_name, 6572 remove_info=True, 6573 add_samples=True, 6574 index=False, 6575 where_clause=where_clause, 6576 ) 6577 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6578 if any(value for value in splice_config.values() if value is None): 6579 log.warning("At least one splice config parameter is empty") 6580 # exit annotation_splice 6581 return None 6582 6583 # Params in splice nf 6584 def check_values(dico: dict): 6585 """ 6586 Ensure parameters for NF splice pipeline 6587 """ 6588 for key, val in dico.items(): 6589 if key == "genome": 6590 if any( 6591 assemb in options.get("genome", {}) 6592 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6593 ): 6594 yield f"--{key} hg19" 6595 elif any( 6596 assemb in options.get("genome", {}) 6597 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6598 ): 6599 yield f"--{key} hg38" 6600 elif ( 6601 (isinstance(val, str) and val) 6602 or isinstance(val, int) 6603 or isinstance(val, bool) 6604 ): 6605 yield f"--{key} {val}" 6606 6607 # Genome 6608 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6609 options["genome"] = genome 6610 # NF params 6611 nf_params = [] 6612 # Add options 6613 if options: 6614 log.debug(options) 6615 nf_params 
= list(check_values(options)) 6616 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6617 else: 6618 log.debug("No NF params provided") 6619 # Add threads 6620 if "threads" not in options.keys(): 6621 nf_params.append(f"--threads {threads}") 6622 # Genome path 6623 genome_path = find_genome( 6624 config.get("folders", {}) 6625 .get("databases", {}) 6626 .get("genomes", DEFAULT_GENOME_FOLDER), 6627 file=f"{genome}.fa", 6628 ) 6629 # Add genome path 6630 if not genome_path: 6631 raise ValueError( 6632 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6633 ) 6634 else: 6635 log.debug(f"Genome: {genome_path}") 6636 nf_params.append(f"--genome_path {genome_path}") 6637 6638 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6639 """ 6640 Setting up updated databases for SPiP and SpliceAI 6641 """ 6642 6643 try: 6644 6645 # SpliceAI assembly transcriptome 6646 spliceai_assembly = os.path.join( 6647 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6648 options.get("genome"), 6649 "transcriptome", 6650 ) 6651 spip_assembly = options.get("genome") 6652 6653 spip = find( 6654 f"transcriptome_{spip_assembly}.RData", 6655 config.get("folders", {}).get("databases", {}).get("spip", {}), 6656 ) 6657 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6658 log.debug(f"SPiP annotations: {spip}") 6659 log.debug(f"SpliceAI annotations: {spliceai}") 6660 if spip and spliceai: 6661 return [ 6662 f"--spip_transcriptome {spip}", 6663 f"--spliceai_transcriptome {spliceai}", 6664 ] 6665 else: 6666 log.warning( 6667 "Can't find splice databases in configuration, use annotations file from image" 6668 ) 6669 except TypeError: 6670 log.warning( 6671 "Can't find splice databases in configuration, use annotations file from image" 6672 ) 6673 return [] 6674 6675 # Add options, check if transcriptome option have already beend provided 6676 if ( 6677 
"spip_transcriptome" not in nf_params 6678 and "spliceai_transcriptome" not in nf_params 6679 ): 6680 splice_reference = splice_annotations(options, config) 6681 if splice_reference: 6682 nf_params.extend(splice_reference) 6683 # nf_params.append(f"--output_folder {output_folder}") 6684 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6685 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6686 log.debug(cmd) 6687 splice_config["docker"]["command"] = cmd 6688 6689 # Ensure proxy is set 6690 proxy = [ 6691 f"-e {var}={os.getenv(var)}" 6692 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6693 if os.getenv(var) is not None 6694 ] 6695 docker_cmd = get_bin_command( 6696 tool="splice", 6697 bin_type="docker", 6698 config=config, 6699 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6700 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6701 ) 6702 # print(docker_cmd) 6703 # exit() 6704 # Docker debug 6705 # if splice_config.get("rm_container"): 6706 # rm_container = "--rm" 6707 # else: 6708 # rm_container = "" 6709 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6710 log.debug(docker_cmd) 6711 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6712 log.debug(res.stdout) 6713 if res.stderr: 6714 log.error(res.stderr) 6715 res.check_returncode() 6716 # Update variants 6717 log.info("Annotation - Updating...") 6718 # Test find output vcf 6719 log.debug( 6720 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6721 ) 6722 output_vcf = [] 6723 # Wrong folder to look in 6724 for 
files in os.listdir(os.path.dirname(tmp_vcf_name)): 6725 if ( 6726 files 6727 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6728 ): 6729 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6730 # log.debug(os.listdir(options.get("output_folder"))) 6731 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6732 if not output_vcf: 6733 log.debug( 6734 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6735 ) 6736 else: 6737 # Get new header from annotated vcf 6738 log.debug(f"Initial header: {len(header.infos)} fields") 6739 # Create new header with splice infos 6740 new_vcf = Variants(input=output_vcf[0]) 6741 new_vcf_header = new_vcf.get_header().infos 6742 for keys, infos in new_vcf_header.items(): 6743 if keys not in header.infos.keys(): 6744 header.infos[keys] = infos 6745 log.debug(f"New header: {len(header.infos)} fields") 6746 log.debug(f"Splice tmp output: {output_vcf[0]}") 6747 self.update_from_vcf(output_vcf[0]) 6748 6749 # Remove file 6750 remove_if_exists(output_vcf)
This function annotates variants with splice prediction tools (SPiP and SpliceAI).
Parameters
- threads: the number of threads to use
Returns
None.
6756 def get_config_default(self, name: str) -> dict: 6757 """ 6758 The function `get_config_default` returns a dictionary containing default configurations for 6759 various calculations and prioritizations. 6760 6761 :param name: The `get_config_default` function returns a dictionary containing default 6762 configurations for different calculations and prioritizations. The `name` parameter is used to 6763 specify which specific configuration to retrieve from the dictionary 6764 :type name: str 6765 :return: The function `get_config_default` returns a dictionary containing default configuration 6766 settings for different calculations and prioritizations. The specific configuration settings are 6767 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6768 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6769 returned. If there is no match, an empty dictionary is returned. 6770 """ 6771 6772 config_default = { 6773 "calculations": { 6774 "variant_chr_pos_alt_ref": { 6775 "type": "sql", 6776 "name": "variant_chr_pos_alt_ref", 6777 "description": "Create a variant ID with chromosome, position, alt and ref", 6778 "available": False, 6779 "output_column_name": "variant_chr_pos_alt_ref", 6780 "output_column_type": "String", 6781 "output_column_description": "variant ID with chromosome, position, alt and ref", 6782 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6783 "operation_info": True, 6784 }, 6785 "VARTYPE": { 6786 "type": "sql", 6787 "name": "VARTYPE", 6788 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6789 "available": True, 6790 "table": "variants", 6791 "output_column_name": "VARTYPE", 6792 "output_column_type": "String", 6793 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6794 "operation_query": """ 6795 CASE 6796 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6797 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6798 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6799 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6800 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6801 ELSE 'UNDEFINED' 6802 END 6803 """, 6804 "info_fields": ["SVTYPE"], 6805 "operation_info": True, 6806 }, 6807 "snpeff_hgvs": { 6808 "type": "python", 6809 "name": "snpeff_hgvs", 6810 "description": "HGVS nomenclatures from snpEff annotation", 6811 "available": True, 6812 "function_name": "calculation_extract_snpeff_hgvs", 6813 "function_params": ["snpeff_hgvs", "ANN"], 6814 }, 6815 "snpeff_ann_explode": { 6816 "type": "python", 6817 "name": "snpeff_ann_explode", 6818 "description": "Explode snpEff annotations with uniquify values", 6819 "available": True, 6820 "function_name": "calculation_snpeff_ann_explode", 6821 "function_params": [False, "fields", "snpeff_", "ANN"], 6822 }, 6823 "snpeff_ann_explode_uniquify": { 6824 "type": "python", 6825 "name": "snpeff_ann_explode_uniquify", 6826 "description": "Explode snpEff annotations", 6827 "available": True, 6828 "function_name": "calculation_snpeff_ann_explode", 6829 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6830 }, 6831 "snpeff_ann_explode_json": { 6832 "type": "python", 6833 "name": "snpeff_ann_explode_json", 6834 "description": "Explode snpEff annotations in JSON format", 6835 "available": True, 6836 "function_name": "calculation_snpeff_ann_explode", 6837 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6838 }, 6839 "NOMEN": { 6840 "type": "python", 6841 "name": "NOMEN", 6842 "description": "NOMEN information (e.g. 
NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6843 "available": True, 6844 "function_name": "calculation_extract_nomen", 6845 "function_params": [], 6846 }, 6847 "RENAME_INFO_FIELDS": { 6848 "type": "python", 6849 "name": "RENAME_INFO_FIELDS", 6850 "description": "Rename or remove INFO/tags", 6851 "available": True, 6852 "function_name": "calculation_rename_info_fields", 6853 "function_params": [], 6854 }, 6855 "FINDBYPIPELINE": { 6856 "type": "python", 6857 "name": "FINDBYPIPELINE", 6858 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6859 "available": True, 6860 "function_name": "calculation_find_by_pipeline", 6861 "function_params": ["findbypipeline"], 6862 }, 6863 "FINDBYSAMPLE": { 6864 "type": "python", 6865 "name": "FINDBYSAMPLE", 6866 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6867 "available": True, 6868 "function_name": "calculation_find_by_pipeline", 6869 "function_params": ["findbysample"], 6870 }, 6871 "GENOTYPECONCORDANCE": { 6872 "type": "python", 6873 "name": "GENOTYPECONCORDANCE", 6874 "description": "Concordance of genotype for multi caller VCF", 6875 "available": True, 6876 "function_name": "calculation_genotype_concordance", 6877 "function_params": [], 6878 }, 6879 "BARCODE": { 6880 "type": "python", 6881 "name": "BARCODE", 6882 "description": "BARCODE as VaRank tool", 6883 "available": True, 6884 "function_name": "calculation_barcode", 6885 "function_params": [], 6886 }, 6887 "BARCODEFAMILY": { 6888 "type": "python", 6889 "name": "BARCODEFAMILY", 6890 "description": "BARCODEFAMILY as VaRank tool", 6891 "available": True, 6892 "function_name": "calculation_barcode_family", 6893 "function_params": ["BCF"], 6894 }, 6895 "TRIO": { 6896 "type": "python", 6897 "name": "TRIO", 6898 "description": "Inheritance for a trio family", 6899 "available": True, 6900 "function_name": "calculation_trio", 6901 "function_params": [], 6902 }, 
6903 "VAF": { 6904 "type": "python", 6905 "name": "VAF", 6906 "description": "Variant Allele Frequency (VAF) harmonization", 6907 "available": True, 6908 "function_name": "calculation_vaf_normalization", 6909 "function_params": [], 6910 }, 6911 "VAF_stats": { 6912 "type": "python", 6913 "name": "VAF_stats", 6914 "description": "Variant Allele Frequency (VAF) statistics", 6915 "available": True, 6916 "function_name": "calculation_genotype_stats", 6917 "function_params": ["VAF"], 6918 }, 6919 "DP_stats": { 6920 "type": "python", 6921 "name": "DP_stats", 6922 "description": "Depth (DP) statistics", 6923 "available": True, 6924 "function_name": "calculation_genotype_stats", 6925 "function_params": ["DP"], 6926 }, 6927 "variant_id": { 6928 "type": "python", 6929 "name": "variant_id", 6930 "description": "Variant ID generated from variant position and type", 6931 "available": True, 6932 "function_name": "calculation_variant_id", 6933 "function_params": [], 6934 }, 6935 "transcripts_json": { 6936 "type": "python", 6937 "name": "transcripts_json", 6938 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6939 "available": True, 6940 "function_name": "calculation_transcripts_annotation", 6941 "function_params": ["transcripts_json", None], 6942 }, 6943 "transcripts_ann": { 6944 "type": "python", 6945 "name": "transcripts_ann", 6946 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6947 "available": True, 6948 "function_name": "calculation_transcripts_annotation", 6949 "function_params": [None, "transcripts_ann"], 6950 }, 6951 "transcripts_annotations": { 6952 "type": "python", 6953 "name": "transcripts_annotations", 6954 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6955 "available": True, 6956 "function_name": "calculation_transcripts_annotation", 6957 "function_params": [None, None], 6958 }, 6959 "transcripts_prioritization": { 6960 "type": 
"python", 6961 "name": "transcripts_prioritization", 6962 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6963 "available": True, 6964 "function_name": "calculation_transcripts_prioritization", 6965 "function_params": [], 6966 }, 6967 "transcripts_export": { 6968 "type": "python", 6969 "name": "transcripts_export", 6970 "description": "Export transcripts table/view as a file (using param.json)", 6971 "available": True, 6972 "function_name": "calculation_transcripts_export", 6973 "function_params": [], 6974 }, 6975 }, 6976 "prioritizations": { 6977 "default": { 6978 "ANN2": [ 6979 { 6980 "type": "contains", 6981 "value": "HIGH", 6982 "score": 5, 6983 "flag": "PASS", 6984 "comment": [ 6985 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6986 ], 6987 }, 6988 { 6989 "type": "contains", 6990 "value": "MODERATE", 6991 "score": 3, 6992 "flag": "PASS", 6993 "comment": [ 6994 "A non-disruptive variant that might change protein effectiveness" 6995 ], 6996 }, 6997 { 6998 "type": "contains", 6999 "value": "LOW", 7000 "score": 0, 7001 "flag": "FILTERED", 7002 "comment": [ 7003 "Assumed to be mostly harmless or unlikely to change protein behavior" 7004 ], 7005 }, 7006 { 7007 "type": "contains", 7008 "value": "MODIFIER", 7009 "score": 0, 7010 "flag": "FILTERED", 7011 "comment": [ 7012 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7013 ], 7014 }, 7015 ], 7016 } 7017 }, 7018 } 7019 7020 return config_default.get(name, None)
The function `get_config_default` returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: specifies which specific configuration to retrieve from the dictionary of defaults.
Returns
The default configuration settings for the given `name`. If the `name` parameter matches a key
in the `config_default` dictionary, the corresponding configuration settings are returned;
otherwise an empty dictionary is returned.
7022 def get_config_json( 7023 self, name: str, config_dict: dict = {}, config_file: str = None 7024 ) -> dict: 7025 """ 7026 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7027 default values, a dictionary, and a file. 7028 7029 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7030 the name of the configuration. It is used to identify and retrieve the configuration settings 7031 for a specific component or module 7032 :type name: str 7033 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7034 dictionary that allows you to provide additional configuration settings or overrides. When you 7035 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7036 the key is the configuration setting you want to override or 7037 :type config_dict: dict 7038 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7039 specify the path to a configuration file that contains additional settings. If provided, the 7040 function will read the contents of this file and update the configuration dictionary with the 7041 values found in the file, overriding any existing values with the 7042 :type config_file: str 7043 :return: The function `get_config_json` returns a dictionary containing the configuration 7044 settings. 
7045 """ 7046 7047 # Create with default prioritizations 7048 config_default = self.get_config_default(name=name) 7049 configuration = config_default 7050 # log.debug(f"configuration={configuration}") 7051 7052 # Replace prioritizations from dict 7053 for config in config_dict: 7054 configuration[config] = config_dict[config] 7055 7056 # Replace prioritizations from file 7057 config_file = full_path(config_file) 7058 if config_file: 7059 if os.path.exists(config_file): 7060 with open(config_file) as config_file_content: 7061 config_file_dict = yaml.safe_load(config_file_content) 7062 for config in config_file_dict: 7063 configuration[config] = config_file_dict[config] 7064 else: 7065 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7066 log.error(msg_error) 7067 raise ValueError(msg_error) 7068 7069 return configuration
The function `get_config_json` retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: a string that represents the name of the configuration. It is used to identify and
retrieve the configuration settings for a specific component or module.
- config_dict: a dictionary that allows you to provide additional configuration settings or
overrides; each key-value pair overrides the corresponding default setting.
- config_file: the path to a configuration file that contains additional settings. If provided,
the function reads the contents of this file and updates the configuration dictionary with the
values found in the file, overriding any existing values.
Returns
A dictionary containing the configuration settings.
7071 def prioritization( 7072 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7073 ) -> bool: 7074 """ 7075 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7076 prioritizes variants based on configured profiles and criteria. 7077 7078 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7079 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7080 a table name is provided, the method will prioritize the variants in that specific table 7081 :type table: str 7082 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7083 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7084 provided, the code will use a default prefix value of "PZ" 7085 :type pz_prefix: str 7086 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7087 additional parameters specific to the prioritization process. These parameters can include 7088 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7089 configurations needed for the prioritization of variants in a V 7090 :type pz_param: dict 7091 :return: A boolean value (True) is being returned from the `prioritization` function. 
7092 """ 7093 7094 # Config 7095 config = self.get_config() 7096 7097 # Param 7098 param = self.get_param() 7099 7100 # Prioritization param 7101 if pz_param is not None: 7102 prioritization_param = pz_param 7103 else: 7104 prioritization_param = param.get("prioritization", {}) 7105 7106 # Configuration profiles 7107 prioritization_config_file = prioritization_param.get( 7108 "prioritization_config", None 7109 ) 7110 prioritization_config_file = full_path(prioritization_config_file) 7111 prioritizations_config = self.get_config_json( 7112 name="prioritizations", config_file=prioritization_config_file 7113 ) 7114 7115 # Prioritization prefix 7116 pz_prefix_default = "PZ" 7117 if pz_prefix is None: 7118 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7119 7120 # Prioritization options 7121 profiles = prioritization_param.get("profiles", []) 7122 if isinstance(profiles, str): 7123 profiles = profiles.split(",") 7124 pzfields = prioritization_param.get( 7125 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7126 ) 7127 if isinstance(pzfields, str): 7128 pzfields = pzfields.split(",") 7129 default_profile = prioritization_param.get("default_profile", None) 7130 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7131 prioritization_score_mode = prioritization_param.get( 7132 "prioritization_score_mode", "HOWARD" 7133 ) 7134 7135 # Quick Prioritizations 7136 prioritizations = param.get("prioritizations", None) 7137 if prioritizations: 7138 log.info("Quick Prioritization:") 7139 for profile in prioritizations.split(","): 7140 if profile not in profiles: 7141 profiles.append(profile) 7142 log.info(f" {profile}") 7143 7144 # If profile "ALL" provided, all profiles in the config profiles 7145 if "ALL" in profiles: 7146 profiles = list(prioritizations_config.keys()) 7147 7148 for profile in profiles: 7149 if prioritizations_config.get(profile, None): 7150 log.debug(f"Profile '{profile}' configured") 7151 else: 7152 msg_error = f"Profile 
'{profile}' NOT configured" 7153 log.error(msg_error) 7154 raise ValueError(msg_error) 7155 7156 if profiles: 7157 log.info(f"Prioritization... ") 7158 else: 7159 log.debug(f"No profile defined") 7160 return False 7161 7162 if not default_profile and len(profiles): 7163 default_profile = profiles[0] 7164 7165 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7166 log.debug("Profiles to check: " + str(list(profiles))) 7167 7168 # Variables 7169 if table is not None: 7170 table_variants = table 7171 else: 7172 table_variants = self.get_table_variants(clause="update") 7173 log.debug(f"Table to prioritize: {table_variants}") 7174 7175 # Added columns 7176 added_columns = [] 7177 7178 # Create list of PZfields 7179 # List of PZFields 7180 list_of_pzfields_original = pzfields + [ 7181 pzfield + pzfields_sep + profile 7182 for pzfield in pzfields 7183 for profile in profiles 7184 ] 7185 list_of_pzfields = [] 7186 log.debug(f"{list_of_pzfields_original}") 7187 7188 # Remove existing PZfields to use if exists 7189 for pzfield in list_of_pzfields_original: 7190 if self.get_header().infos.get(pzfield, None) is None: 7191 list_of_pzfields.append(pzfield) 7192 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7193 else: 7194 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7195 7196 if list_of_pzfields: 7197 7198 # Explode Infos prefix 7199 explode_infos_prefix = self.get_explode_infos_prefix() 7200 7201 # PZfields tags description 7202 PZfields_INFOS = { 7203 f"{pz_prefix}Tags": { 7204 "ID": f"{pz_prefix}Tags", 7205 "Number": ".", 7206 "Type": "String", 7207 "Description": "Variant tags based on annotation criteria", 7208 }, 7209 f"{pz_prefix}Score": { 7210 "ID": f"{pz_prefix}Score", 7211 "Number": 1, 7212 "Type": "Integer", 7213 "Description": "Variant score based on annotation criteria", 7214 }, 7215 f"{pz_prefix}Flag": { 7216 "ID": f"{pz_prefix}Flag", 7217 "Number": 1, 7218 "Type": "String", 7219 
"Description": "Variant flag based on annotation criteria", 7220 }, 7221 f"{pz_prefix}Comment": { 7222 "ID": f"{pz_prefix}Comment", 7223 "Number": ".", 7224 "Type": "String", 7225 "Description": "Variant comment based on annotation criteria", 7226 }, 7227 f"{pz_prefix}Infos": { 7228 "ID": f"{pz_prefix}Infos", 7229 "Number": ".", 7230 "Type": "String", 7231 "Description": "Variant infos based on annotation criteria", 7232 }, 7233 f"{pz_prefix}Class": { 7234 "ID": f"{pz_prefix}Class", 7235 "Number": ".", 7236 "Type": "String", 7237 "Description": "Variant class based on annotation criteria", 7238 }, 7239 } 7240 7241 # Create INFO fields if not exist 7242 for field in PZfields_INFOS: 7243 field_ID = PZfields_INFOS[field]["ID"] 7244 field_description = PZfields_INFOS[field]["Description"] 7245 if field_ID not in self.get_header().infos and field_ID in pzfields: 7246 field_description = ( 7247 PZfields_INFOS[field]["Description"] 7248 + f", profile {default_profile}" 7249 ) 7250 self.get_header().infos[field_ID] = vcf.parser._Info( 7251 field_ID, 7252 PZfields_INFOS[field]["Number"], 7253 PZfields_INFOS[field]["Type"], 7254 field_description, 7255 "unknown", 7256 "unknown", 7257 code_type_map[PZfields_INFOS[field]["Type"]], 7258 ) 7259 7260 # Create INFO fields if not exist for each profile 7261 for profile in prioritizations_config: 7262 if profile in profiles or profiles == []: 7263 for field in PZfields_INFOS: 7264 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7265 field_description = ( 7266 PZfields_INFOS[field]["Description"] 7267 + f", profile {profile}" 7268 ) 7269 if ( 7270 field_ID not in self.get_header().infos 7271 and field in pzfields 7272 ): 7273 self.get_header().infos[field_ID] = vcf.parser._Info( 7274 field_ID, 7275 PZfields_INFOS[field]["Number"], 7276 PZfields_INFOS[field]["Type"], 7277 field_description, 7278 "unknown", 7279 "unknown", 7280 code_type_map[PZfields_INFOS[field]["Type"]], 7281 ) 7282 7283 # Header 7284 for pzfield in 
list_of_pzfields: 7285 if re.match(f"{pz_prefix}Score.*", pzfield): 7286 added_column = self.add_column( 7287 table_name=table_variants, 7288 column_name=pzfield, 7289 column_type="INTEGER", 7290 default_value="0", 7291 ) 7292 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7293 added_column = self.add_column( 7294 table_name=table_variants, 7295 column_name=pzfield, 7296 column_type="BOOLEAN", 7297 default_value="1", 7298 ) 7299 elif re.match(f"{pz_prefix}Class.*", pzfield): 7300 added_column = self.add_column( 7301 table_name=table_variants, 7302 column_name=pzfield, 7303 column_type="VARCHAR[]", 7304 default_value="null", 7305 ) 7306 else: 7307 added_column = self.add_column( 7308 table_name=table_variants, 7309 column_name=pzfield, 7310 column_type="STRING", 7311 default_value="''", 7312 ) 7313 added_columns.append(added_column) 7314 7315 # Profiles 7316 if profiles: 7317 7318 # foreach profile in configuration file 7319 for profile in prioritizations_config: 7320 7321 # If profile is asked in param, or ALL are asked (empty profile []) 7322 if profile in profiles or profiles == []: 7323 log.info(f"Profile '{profile}'") 7324 7325 sql_set_info_option = "" 7326 7327 sql_set_info = [] 7328 7329 # PZ fields set 7330 7331 # PZScore 7332 if ( 7333 f"{pz_prefix}Score{pzfields_sep}{profile}" 7334 in list_of_pzfields 7335 ): 7336 sql_set_info.append( 7337 f""" 7338 concat( 7339 '{pz_prefix}Score{pzfields_sep}{profile}=', 7340 {pz_prefix}Score{pzfields_sep}{profile} 7341 ) 7342 """ 7343 ) 7344 if ( 7345 profile == default_profile 7346 and f"{pz_prefix}Score" in list_of_pzfields 7347 ): 7348 sql_set_info.append( 7349 f""" 7350 concat( 7351 '{pz_prefix}Score=', 7352 {pz_prefix}Score{pzfields_sep}{profile} 7353 ) 7354 """ 7355 ) 7356 7357 # PZFlag 7358 if ( 7359 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7360 in list_of_pzfields 7361 ): 7362 sql_set_info.append( 7363 f""" 7364 concat( 7365 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7366 CASE 7367 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7368 THEN 'PASS' 7369 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7370 THEN 'FILTERED' 7371 END 7372 ) 7373 """ 7374 ) 7375 if ( 7376 profile == default_profile 7377 and f"{pz_prefix}Flag" in list_of_pzfields 7378 ): 7379 sql_set_info.append( 7380 f""" 7381 concat( 7382 '{pz_prefix}Flag=', 7383 CASE 7384 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7385 THEN 'PASS' 7386 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7387 THEN 'FILTERED' 7388 END 7389 ) 7390 """ 7391 ) 7392 7393 # PZClass 7394 if ( 7395 f"{pz_prefix}Class{pzfields_sep}{profile}" 7396 in list_of_pzfields 7397 ): 7398 sql_set_info.append( 7399 f""" 7400 concat( 7401 '{pz_prefix}Class{pzfields_sep}{profile}=', 7402 CASE 7403 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7404 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7405 ELSE '.' 7406 END 7407 ) 7408 7409 """ 7410 ) 7411 if ( 7412 profile == default_profile 7413 and f"{pz_prefix}Class" in list_of_pzfields 7414 ): 7415 sql_set_info.append( 7416 f""" 7417 concat( 7418 '{pz_prefix}Class=', 7419 CASE 7420 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7421 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7422 ELSE '.' 
7423 END 7424 ) 7425 """ 7426 ) 7427 7428 # PZComment 7429 if ( 7430 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7431 in list_of_pzfields 7432 ): 7433 sql_set_info.append( 7434 f""" 7435 CASE 7436 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7437 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7438 ELSE '' 7439 END 7440 """ 7441 ) 7442 if ( 7443 profile == default_profile 7444 and f"{pz_prefix}Comment" in list_of_pzfields 7445 ): 7446 sql_set_info.append( 7447 f""" 7448 CASE 7449 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7450 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7451 ELSE '' 7452 END 7453 """ 7454 ) 7455 7456 # PZInfos 7457 if ( 7458 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7459 in list_of_pzfields 7460 ): 7461 sql_set_info.append( 7462 f""" 7463 CASE 7464 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7465 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7466 ELSE '' 7467 END 7468 """ 7469 ) 7470 if ( 7471 profile == default_profile 7472 and f"{pz_prefix}Infos" in list_of_pzfields 7473 ): 7474 sql_set_info.append( 7475 f""" 7476 CASE 7477 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7478 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7479 ELSE '' 7480 END 7481 """ 7482 ) 7483 7484 # Merge PZfields 7485 sql_set_info_option = "" 7486 sql_set_sep = "" 7487 for sql_set in sql_set_info: 7488 if sql_set_sep: 7489 sql_set_info_option += f""" 7490 , concat('{sql_set_sep}', {sql_set}) 7491 """ 7492 else: 7493 sql_set_info_option += f""" 7494 , {sql_set} 7495 """ 7496 sql_set_sep = ";" 7497 7498 sql_queries = [] 7499 for annotation in prioritizations_config[profile]: 7500 7501 # skip special sections 7502 if annotation.startswith("_"): 7503 continue 7504 7505 # For each criterions 7506 for criterion in prioritizations_config[profile][ 7507 annotation 
7508 ]: 7509 7510 # Criterion mode 7511 criterion_mode = None 7512 if np.any( 7513 np.isin(list(criterion.keys()), ["type", "value"]) 7514 ): 7515 criterion_mode = "operation" 7516 elif np.any( 7517 np.isin(list(criterion.keys()), ["sql", "fields"]) 7518 ): 7519 criterion_mode = "sql" 7520 log.debug(f"Criterion Mode: {criterion_mode}") 7521 7522 # Criterion parameters 7523 criterion_type = criterion.get("type", None) 7524 criterion_value = criterion.get("value", None) 7525 criterion_sql = criterion.get("sql", None) 7526 criterion_fields = criterion.get("fields", None) 7527 criterion_score = criterion.get("score", 0) 7528 criterion_flag = criterion.get("flag", "PASS") 7529 criterion_class = criterion.get("class", None) 7530 criterion_flag_bool = criterion_flag == "PASS" 7531 criterion_comment = ( 7532 ", ".join(criterion.get("comment", [])) 7533 .replace("'", "''") 7534 .replace(";", ",") 7535 .replace("\t", " ") 7536 ) 7537 criterion_infos = ( 7538 str(criterion) 7539 .replace("'", "''") 7540 .replace(";", ",") 7541 .replace("\t", " ") 7542 ) 7543 7544 # SQL 7545 if criterion_sql is not None and isinstance( 7546 criterion_sql, list 7547 ): 7548 criterion_sql = " ".join(criterion_sql) 7549 7550 # Fields and explode 7551 if criterion_fields is None: 7552 criterion_fields = [annotation] 7553 if not isinstance(criterion_fields, list): 7554 criterion_fields = str(criterion_fields).split(",") 7555 7556 # Class 7557 if criterion_class is not None and not isinstance( 7558 criterion_class, list 7559 ): 7560 criterion_class = str(criterion_class).split(",") 7561 7562 for annotation_field in criterion_fields: 7563 7564 # Explode specific annotation 7565 log.debug( 7566 f"Explode annotation '{annotation_field}'" 7567 ) 7568 added_columns += self.explode_infos( 7569 prefix=explode_infos_prefix, 7570 fields=[annotation_field], 7571 table=table_variants, 7572 ) 7573 extra_infos = self.get_extra_infos( 7574 table=table_variants 7575 ) 7576 7577 # Check if annotation field is 
present 7578 if ( 7579 f"{explode_infos_prefix}{annotation_field}" 7580 not in extra_infos 7581 ): 7582 msq_err = f"Annotation '{annotation_field}' not in data" 7583 log.error(msq_err) 7584 raise ValueError(msq_err) 7585 else: 7586 log.debug( 7587 f"Annotation '{annotation_field}' in data" 7588 ) 7589 7590 sql_set = [] 7591 sql_set_info = [] 7592 7593 # PZ fields set 7594 7595 # PZScore 7596 if ( 7597 f"{pz_prefix}Score{pzfields_sep}{profile}" 7598 in list_of_pzfields 7599 ): 7600 # VaRank prioritization score mode 7601 if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]: 7602 sql_set.append( 7603 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7604 ) 7605 # default HOWARD prioritization score mode 7606 else: 7607 sql_set.append( 7608 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7609 ) 7610 7611 # PZFlag 7612 if ( 7613 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7614 in list_of_pzfields 7615 ): 7616 sql_set.append( 7617 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7618 ) 7619 7620 # PZClass 7621 if ( 7622 f"{pz_prefix}Class{pzfields_sep}{profile}" 7623 in list_of_pzfields 7624 and criterion_class is not None 7625 ): 7626 sql_set.append( 7627 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7628 ) 7629 7630 # PZComment 7631 if ( 7632 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7633 in list_of_pzfields 7634 ): 7635 sql_set.append( 7636 f""" 7637 {pz_prefix}Comment{pzfields_sep}{profile} = 7638 concat( 7639 {pz_prefix}Comment{pzfields_sep}{profile}, 7640 CASE 7641 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7642 THEN ', ' 7643 ELSE '' 7644 END, 7645 '{criterion_comment}' 7646 ) 7647 """ 
7648 ) 7649 7650 # PZInfos 7651 if ( 7652 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7653 in list_of_pzfields 7654 ): 7655 sql_set.append( 7656 f""" 7657 {pz_prefix}Infos{pzfields_sep}{profile} = 7658 concat( 7659 {pz_prefix}Infos{pzfields_sep}{profile}, 7660 '{criterion_infos}' 7661 ) 7662 """ 7663 ) 7664 sql_set_option = ",".join(sql_set) 7665 7666 # Criterion and comparison 7667 if sql_set_option: 7668 7669 if criterion_mode in ["operation"]: 7670 7671 try: 7672 float(criterion_value) 7673 sql_update = f""" 7674 UPDATE {table_variants} 7675 SET {sql_set_option} 7676 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7677 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7678 """ 7679 except: 7680 contains_option = "" 7681 if criterion_type == "contains": 7682 contains_option = ".*" 7683 sql_update = f""" 7684 UPDATE {table_variants} 7685 SET {sql_set_option} 7686 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7687 """ 7688 sql_queries.append(sql_update) 7689 7690 elif criterion_mode in ["sql"]: 7691 7692 sql_update = f""" 7693 UPDATE {table_variants} 7694 SET {sql_set_option} 7695 WHERE {criterion_sql} 7696 """ 7697 sql_queries.append(sql_update) 7698 7699 else: 7700 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7701 log.error(msg_err) 7702 raise ValueError(msg_err) 7703 7704 else: 7705 log.warning( 7706 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7707 ) 7708 7709 # PZTags 7710 if ( 7711 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7712 in list_of_pzfields 7713 ): 7714 7715 # Create PZFalgs value 7716 pztags_value = "" 7717 pztags_sep_default = "," 7718 pztags_sep = "" 7719 for pzfield in pzfields: 7720 if pzfield not in [f"{pz_prefix}Tags"]: 7721 if ( 7722 f"{pzfield}{pzfields_sep}{profile}" 7723 in list_of_pzfields 7724 ): 7725 if pzfield in [f"{pz_prefix}Flag"]: 7726 
pztags_value += f"""{pztags_sep}{pzfield}#', 7727 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7728 THEN 'PASS' 7729 ELSE 'FILTERED' 7730 END, '""" 7731 elif pzfield in [f"{pz_prefix}Class"]: 7732 pztags_value += f"""{pztags_sep}{pzfield}#', 7733 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7734 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7735 ELSE '.' 7736 END, '""" 7737 else: 7738 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7739 pztags_sep = pztags_sep_default 7740 7741 # Add Query update for PZFlags 7742 sql_update_pztags = f""" 7743 UPDATE {table_variants} 7744 SET INFO = concat( 7745 INFO, 7746 CASE WHEN INFO NOT in ('','.') 7747 THEN ';' 7748 ELSE '' 7749 END, 7750 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7751 ) 7752 """ 7753 sql_queries.append(sql_update_pztags) 7754 7755 # Add Query update for PZFlags for default 7756 if profile == default_profile: 7757 sql_update_pztags_default = f""" 7758 UPDATE {table_variants} 7759 SET INFO = concat( 7760 INFO, 7761 ';', 7762 '{pz_prefix}Tags={pztags_value}' 7763 ) 7764 """ 7765 sql_queries.append(sql_update_pztags_default) 7766 7767 log.info(f"""Profile '{profile}' - Prioritization... """) 7768 7769 if sql_queries: 7770 7771 for sql_query in sql_queries: 7772 log.debug( 7773 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7774 ) 7775 self.conn.execute(sql_query) 7776 7777 log.info(f"""Profile '{profile}' - Update... 
""") 7778 sql_query_update = f""" 7779 UPDATE {table_variants} 7780 SET INFO = 7781 concat( 7782 CASE 7783 WHEN INFO NOT IN ('','.') 7784 THEN concat(INFO, ';') 7785 ELSE '' 7786 END 7787 {sql_set_info_option} 7788 ) 7789 """ 7790 self.conn.execute(sql_query_update) 7791 7792 else: 7793 7794 log.warning(f"No profiles in parameters") 7795 7796 # Remove added columns 7797 for added_column in added_columns: 7798 self.drop_column(column=added_column) 7799 7800 # Explode INFOS fields into table fields 7801 if self.get_explode_infos(): 7802 self.explode_infos( 7803 prefix=self.get_explode_infos_prefix(), 7804 fields=self.get_explode_infos_fields(), 7805 force=True, 7806 ) 7807 7808 return True
The prioritization function in Python processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The
The `table` parameter in the `prioritization` function is used to specify the name of the table (the variants table) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table. - pz_prefix: The
`pz_prefix` parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ". - pz_param: The
`pz_param` parameter in the `prioritization` method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns
A boolean value (`True`) is returned from the `prioritization` function on success.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline overview:
        1. Resolve genome, refSeq and refSeqLink database files from config/param.
        2. Select SNV/InDel variants only (REF and ALT strictly alphabetic).
        3. Annotate each variant in parallel (Dask partitions) with the HGVS names
           of all transcripts overlapping its position.
        4. Write the result into a temporary column, fold it into INFO, and declare
           the 'hgvs' INFO field in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function applied to each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of `partition`
            (a pandas DataFrame) and return the resulting Series.

            :param partition: pandas DataFrame with CHROM/POS/REF/ALT columns
            :return: result of applying `annotation_hgvs_partition` row-wise (axis=1)
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Compute the HGVS annotation string for a single variant row.

            :param row: dict-like object providing "CHROM", "POS", "REF" and "ALT"
            :return: comma-separated HGVS names, one per overlapping transcript
            (empty string when no transcript overlaps the position)
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (refseq_df is visible to the Polars SQL context via register_globals)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number (only computed when requested)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession (needed for protein-level HGVS)
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally append the protein-level name as an extra entry
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars SQL connection
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" ("key=value,key,...") into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means "enable"
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param (flags captured by the nested annotation closures above)
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink overrides from param
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then lookup by assembly in the genomes folder
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked for cleanup at the end)
        added_columns = []

        # Add hgvs column in variants table
        # (random suffix to avoid colliding with an existing column)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq transcripts overlapping the variants into a Polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
            """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading transcript->protein accession mapping into a Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
                """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
                """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars SQL connection
        # NOTE(review): recreated here, presumably so register_globals picks up
        # refseq_df / refseqlink_df built above — confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file (join on CHROM/POS/REF/ALT)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column (append ';' separator only when INFO is non-empty)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): typo "annotatation" kept as-is (runtime string)
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The
The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the `get_threads()` method.
8201 def get_operations_help( 8202 self, operations_config_dict: dict = {}, operations_config_file: str = None 8203 ) -> list: 8204 8205 # Init 8206 operations_help = [] 8207 8208 # operations 8209 operations = self.get_config_json( 8210 name="calculations", 8211 config_dict=operations_config_dict, 8212 config_file=operations_config_file, 8213 ) 8214 for op in operations: 8215 op_name = operations[op].get("name", op).upper() 8216 op_description = operations[op].get("description", op_name) 8217 op_available = operations[op].get("available", False) 8218 if op_available: 8219 operations_help.append(f" {op_name}: {op_description}") 8220 8221 # Sort operations 8222 operations_help.sort() 8223 8224 # insert header 8225 operations_help.insert(0, "Available calculation operations:") 8226 8227 # Return 8228 return operations_help
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        For each requested operation, look up its definition in the calculations
        configuration and dispatch to the Python implementation
        (`calculation_process_function`) or the SQL implementation
        (`calculation_process_sql`) depending on the operation's "type".

        :param operations: Dict of operations to run (keys are operation names);
        may be overridden by "calculation.calculations" in the parameters
        :type operations: dict
        :param operations_config_dict: Optional in-memory operations configuration
        :type operations_config_dict: dict
        :param operations_config_file: Optional path to an operations configuration
        file; defaults to "calculation.calculation_config" from the parameters
        :type operations_config_file: str
        :raises ValueError: If an operation is unknown, or its type is neither
        "python" nor "sql"

        param json example:
            "calculation": {
              "NOMEN": {
                "options": {
                  "hgvs_field": "hgvs"
                },
                "middle" : null
              }
        """

        # Param
        param = self.get_param()

        # Check operations config file (fall back to param if not given)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation names are matched case-insensitively)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (param takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated "calculations" shortcut)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                add_value_into_dict(
                    dict_tree=operations_tmp,
                    sections=[
                        calculation_operation.upper(),
                    ],
                    value=operations.get(calculation_operation.upper(), {}),
                )
            # Add operations already in param (after the quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (last-resort fallback)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations, dispatch by configured type
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
The `calculation` function takes a list of operations and, for each operation, checks whether it is a Python or an SQL operation, then calls the appropriate processing function.
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle" : null }
8358 def calculation_process_sql( 8359 self, operation: dict, operation_name: str = "unknown" 8360 ) -> None: 8361 """ 8362 The `calculation_process_sql` function takes in a mathematical operation as a string and 8363 performs the operation, updating the specified table with the result. 8364 8365 :param operation: The `operation` parameter is a dictionary that contains information about the 8366 mathematical operation to be performed. It includes the following keys: 8367 :type operation: dict 8368 :param operation_name: The `operation_name` parameter is a string that represents the name of 8369 the mathematical operation being performed. It is used for logging and error handling purposes, 8370 defaults to unknown 8371 :type operation_name: str (optional) 8372 """ 8373 8374 # Operation infos 8375 operation_name = operation.get("name", "unknown") 8376 log.debug(f"process SQL {operation_name}") 8377 output_column_name = operation.get("output_column_name", operation_name) 8378 output_column_type = operation.get("output_column_type", "String") 8379 prefix = operation.get("explode_infos_prefix", "") 8380 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8381 output_column_description = operation.get( 8382 "output_column_description", f"{operation_name} operation" 8383 ) 8384 operation_query = operation.get("operation_query", None) 8385 if isinstance(operation_query, list): 8386 operation_query = " ".join(operation_query) 8387 operation_info_fields = operation.get("info_fields", []) 8388 operation_info_fields_check = operation.get("info_fields_check", False) 8389 operation_info = operation.get("operation_info", True) 8390 operation_table = operation.get( 8391 "table", self.get_table_variants(clause="alter") 8392 ) 8393 8394 # table variants 8395 if operation_table: 8396 table_variants = operation_table 8397 else: 8398 table_variants = self.get_table_variants(clause="alter") 8399 8400 if operation_query: 8401 8402 # Info fields check 8403 
operation_info_fields_check_result = True 8404 if operation_info_fields_check: 8405 header_infos = self.get_header().infos 8406 for info_field in operation_info_fields: 8407 operation_info_fields_check_result = ( 8408 operation_info_fields_check_result 8409 and info_field in header_infos 8410 ) 8411 8412 # If info fields available 8413 if operation_info_fields_check_result: 8414 8415 # Added_columns 8416 added_columns = [] 8417 8418 # Create VCF header field 8419 vcf_reader = self.get_header() 8420 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8421 output_column_name, 8422 ".", 8423 output_column_type, 8424 output_column_description, 8425 "howard calculation", 8426 "0", 8427 self.code_type_map.get(output_column_type), 8428 ) 8429 8430 # Explode infos if needed 8431 log.debug(f"calculation_process_sql prefix {prefix}") 8432 added_columns += self.explode_infos( 8433 prefix=prefix, 8434 fields=[output_column_name] + operation_info_fields, 8435 force=False, 8436 table=table_variants, 8437 ) 8438 8439 # Create column 8440 added_column = self.add_column( 8441 table_name=table_variants, 8442 column_name=prefix + output_column_name, 8443 column_type=output_column_type_sql, 8444 default_value="null", 8445 ) 8446 added_columns.append(added_column) 8447 8448 # Operation calculation 8449 try: 8450 8451 # Query to update calculation column 8452 sql_update = f""" 8453 UPDATE {table_variants} 8454 SET "{prefix}{output_column_name}" = ({operation_query}) 8455 """ 8456 self.conn.execute(sql_update) 8457 8458 # Add to INFO 8459 if operation_info: 8460 sql_update_info = f""" 8461 UPDATE {table_variants} 8462 SET "INFO" = 8463 concat( 8464 CASE 8465 WHEN "INFO" IS NOT NULL 8466 THEN concat("INFO", ';') 8467 ELSE '' 8468 END, 8469 '{output_column_name}=', 8470 "{prefix}{output_column_name}" 8471 ) 8472 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8473 """ 8474 self.conn.execute(sql_update_info) 8475 8476 except: 8477 
log.error( 8478 f"Operations config: Calculation '{operation_name}' query failed" 8479 ) 8480 raise ValueError( 8481 f"Operations config: Calculation '{operation_name}' query failed" 8482 ) 8483 8484 # Remove added columns 8485 for added_column in added_columns: 8486 log.debug(f"added_column: {added_column}") 8487 self.drop_column(column=added_column) 8488 8489 else: 8490 log.error( 8491 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8492 ) 8493 raise ValueError( 8494 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8495 ) 8496 8497 else: 8498 log.error( 8499 f"Operations config: Calculation '{operation_name}' query NOT defined" 8500 ) 8501 raise ValueError( 8502 f"Operations config: Calculation '{operation_name}' query NOT defined" 8503 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: the `operation` parameter is a dictionary that contains information about the mathematical operation to be performed.
- operation_name: the `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8505 def calculation_process_function( 8506 self, operation: dict, operation_name: str = "unknown" 8507 ) -> None: 8508 """ 8509 The `calculation_process_function` takes in an operation dictionary and performs the specified 8510 function with the given parameters. 8511 8512 :param operation: The `operation` parameter is a dictionary that contains information about the 8513 operation to be performed. It has the following keys: 8514 :type operation: dict 8515 :param operation_name: The `operation_name` parameter is a string that represents the name of 8516 the operation being performed. It is used for logging purposes, defaults to unknown 8517 :type operation_name: str (optional) 8518 """ 8519 8520 operation_name = operation["name"] 8521 log.debug(f"process Python {operation_name}") 8522 function_name = operation["function_name"] 8523 function_params = operation["function_params"] 8524 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: the `operation` parameter is a dictionary that contains information about the operation to be performed.
- operation_name: the `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
8526 def calculation_variant_id(self) -> None: 8527 """ 8528 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8529 updates the INFO field of a variants table with the variant ID. 8530 """ 8531 8532 # variant_id annotation field 8533 variant_id_tag = self.get_variant_id_column() 8534 added_columns = [variant_id_tag] 8535 8536 # variant_id hgvs tags" 8537 vcf_infos_tags = { 8538 variant_id_tag: "howard variant ID annotation", 8539 } 8540 8541 # Variants table 8542 table_variants = self.get_table_variants() 8543 8544 # Header 8545 vcf_reader = self.get_header() 8546 8547 # Add variant_id to header 8548 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8549 variant_id_tag, 8550 ".", 8551 "String", 8552 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8553 "howard calculation", 8554 "0", 8555 self.code_type_map.get("String"), 8556 ) 8557 8558 # Update 8559 sql_update = f""" 8560 UPDATE {table_variants} 8561 SET "INFO" = 8562 concat( 8563 CASE 8564 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8565 THEN '' 8566 ELSE concat("INFO", ';') 8567 END, 8568 '{variant_id_tag}=', 8569 "{variant_id_tag}" 8570 ) 8571 """ 8572 self.conn.execute(sql_update) 8573 8574 # Remove added columns 8575 for added_column in added_columns: 8576 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
8578 def calculation_extract_snpeff_hgvs( 8579 self, 8580 snpeff_hgvs: str = "snpeff_hgvs", 8581 snpeff_field: str = "ANN", 8582 ) -> None: 8583 """ 8584 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8585 annotation field in a VCF file and adds them as a new column in the variants table. 8586 8587 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8588 function is used to specify the name of the column that will store the HGVS nomenclatures 8589 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8590 snpeff_hgvs 8591 :type snpeff_hgvs: str (optional) 8592 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8593 function represents the field in the VCF file that contains SnpEff annotations. This field is 8594 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8595 to ANN 8596 :type snpeff_field: str (optional) 8597 """ 8598 8599 # Snpeff hgvs tags 8600 vcf_infos_tags = { 8601 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8602 } 8603 8604 # Prefix 8605 prefix = self.get_explode_infos_prefix() 8606 if prefix: 8607 prefix = "INFO/" 8608 8609 # snpEff fields 8610 speff_ann_infos = prefix + snpeff_field 8611 speff_hgvs_infos = prefix + snpeff_hgvs 8612 8613 # Variants table 8614 table_variants = self.get_table_variants() 8615 8616 # Header 8617 vcf_reader = self.get_header() 8618 8619 # Add columns 8620 added_columns = [] 8621 8622 # Explode HGVS field in column 8623 added_columns += self.explode_infos(fields=[snpeff_field]) 8624 8625 if snpeff_field in vcf_reader.infos: 8626 8627 log.debug(vcf_reader.infos[snpeff_field]) 8628 8629 # Extract ANN header 8630 ann_description = vcf_reader.infos[snpeff_field].desc 8631 pattern = r"'(.+?)'" 8632 match = re.search(pattern, ann_description) 8633 if match: 8634 ann_header_match = match.group(1).split(" | ") 
8635 ann_header_desc = {} 8636 for i in range(len(ann_header_match)): 8637 ann_header_info = "".join( 8638 char for char in ann_header_match[i] if char.isalnum() 8639 ) 8640 ann_header_desc[ann_header_info] = ann_header_match[i] 8641 if not ann_header_desc: 8642 raise ValueError("Invalid header description format") 8643 else: 8644 raise ValueError("Invalid header description format") 8645 8646 # Create variant id 8647 variant_id_column = self.get_variant_id_column() 8648 added_columns += [variant_id_column] 8649 8650 # Create dataframe 8651 dataframe_snpeff_hgvs = self.get_query_to_df( 8652 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8653 ) 8654 8655 # Create main NOMEN column 8656 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8657 speff_ann_infos 8658 ].apply( 8659 lambda x: extract_snpeff_hgvs( 8660 str(x), header=list(ann_header_desc.values()) 8661 ) 8662 ) 8663 8664 # Add snpeff_hgvs to header 8665 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8666 snpeff_hgvs, 8667 ".", 8668 "String", 8669 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8670 "howard calculation", 8671 "0", 8672 self.code_type_map.get("String"), 8673 ) 8674 8675 # Update 8676 sql_update = f""" 8677 UPDATE variants 8678 SET "INFO" = 8679 concat( 8680 CASE 8681 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8682 THEN '' 8683 ELSE concat("INFO", ';') 8684 END, 8685 CASE 8686 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8687 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8688 THEN concat( 8689 '{snpeff_hgvs}=', 8690 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8691 ) 8692 ELSE '' 8693 END 8694 ) 8695 FROM dataframe_snpeff_hgvs 8696 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8697 8698 """ 8699 self.conn.execute(sql_update) 8700 8701 # Delete dataframe 8702 del dataframe_snpeff_hgvs 8703 gc.collect() 8704 8705 else: 8706 8707 log.warning( 8708 "No snpEff 
annotation. Please Anotate with snpEff before use this calculation option" 8709 ) 8710 8711 # Remove added columns 8712 for added_column in added_columns: 8713 self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: the `snpeff_hgvs` parameter specifies the name of the column that will store the HGVS nomenclatures extracted from the snpEff annotation field of a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: the `snpeff_field` parameter is the field of the VCF file that contains the snpEff annotations from which HGVS nomenclatures are extracted; defaults to "ANN".
8715 def calculation_snpeff_ann_explode( 8716 self, 8717 uniquify: bool = True, 8718 output_format: str = "fields", 8719 output_prefix: str = "snpeff_", 8720 snpeff_field: str = "ANN", 8721 ) -> None: 8722 """ 8723 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8724 exploding the HGVS field and updating variant information accordingly. 8725 8726 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8727 boolean flag that determines whether the output should be uniquified or not. When set to `True`, 8728 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8729 defaults to True 8730 :type uniquify: bool (optional) 8731 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8732 function specifies the format in which the output annotations will be generated. It has a 8733 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8734 format, defaults to fields 8735 :type output_format: str (optional) 8736 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8737 method is used to specify the prefix that will be added to the output annotations generated 8738 during the calculation process. This prefix helps to differentiate the newly added annotations 8739 from existing ones in the output data. By default, the, defaults to ANN_ 8740 :type output_prefix: str (optional) 8741 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8742 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8743 field will be processed to explode the HGVS annotations and update the variant information 8744 accordingly, defaults to ANN 8745 :type snpeff_field: str (optional) 8746 """ 8747 8748 # SnpEff annotation field 8749 snpeff_hgvs = "snpeff_ann_explode" 8750 8751 # Snpeff hgvs tags 8752 vcf_infos_tags = { 8753 snpeff_hgvs: "Explode snpEff annotations", 8754 } 8755 8756 # Prefix 8757 prefix = self.get_explode_infos_prefix() 8758 if prefix: 8759 prefix = "INFO/" 8760 8761 # snpEff fields 8762 speff_ann_infos = prefix + snpeff_field 8763 speff_hgvs_infos = prefix + snpeff_hgvs 8764 8765 # Variants table 8766 table_variants = self.get_table_variants() 8767 8768 # Header 8769 vcf_reader = self.get_header() 8770 8771 # Add columns 8772 added_columns = [] 8773 8774 # Explode HGVS field in column 8775 added_columns += self.explode_infos(fields=[snpeff_field]) 8776 log.debug(f"snpeff_field={snpeff_field}") 8777 log.debug(f"added_columns={added_columns}") 8778 8779 if snpeff_field in vcf_reader.infos: 8780 8781 # Extract ANN header 8782 ann_description = vcf_reader.infos[snpeff_field].desc 8783 pattern = r"'(.+?)'" 8784 match = re.search(pattern, ann_description) 8785 if match: 8786 ann_header_match = match.group(1).split(" | ") 8787 ann_header = [] 8788 ann_header_desc = {} 8789 for i in range(len(ann_header_match)): 8790 ann_header_info = "".join( 8791 char for char in ann_header_match[i] if char.isalnum() 8792 ) 8793 ann_header.append(ann_header_info) 8794 ann_header_desc[ann_header_info] = ann_header_match[i] 8795 if not ann_header_desc: 8796 raise ValueError("Invalid header description format") 8797 else: 8798 raise ValueError("Invalid header description format") 8799 8800 # Create variant id 8801 variant_id_column = self.get_variant_id_column() 8802 added_columns += [variant_id_column] 8803 8804 # Create dataframe 8805 dataframe_snpeff_hgvs = self.get_query_to_df( 8806 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8807 ) 8808 
8809 # Create snpEff columns 8810 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8811 speff_ann_infos 8812 ].apply( 8813 lambda x: explode_snpeff_ann( 8814 str(x), 8815 uniquify=uniquify, 8816 output_format=output_format, 8817 prefix=output_prefix, 8818 header=list(ann_header_desc.values()), 8819 ) 8820 ) 8821 8822 # Header 8823 ann_annotations_prefix = "" 8824 if output_format.upper() in ["JSON"]: 8825 ann_annotations_prefix = f"{output_prefix}=" 8826 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8827 output_prefix, 8828 ".", 8829 "String", 8830 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8831 + " - JSON format", 8832 "howard calculation", 8833 "0", 8834 self.code_type_map.get("String"), 8835 ) 8836 else: 8837 for ann_annotation in ann_header: 8838 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8839 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8840 ann_annotation_id, 8841 ".", 8842 "String", 8843 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8844 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8845 "howard calculation", 8846 "0", 8847 self.code_type_map.get("String"), 8848 ) 8849 8850 # Update 8851 sql_update = f""" 8852 UPDATE variants 8853 SET "INFO" = 8854 concat( 8855 CASE 8856 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8857 THEN '' 8858 ELSE concat("INFO", ';') 8859 END, 8860 CASE 8861 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8862 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8863 THEN concat( 8864 '{ann_annotations_prefix}', 8865 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8866 ) 8867 ELSE '' 8868 END 8869 ) 8870 FROM dataframe_snpeff_hgvs 8871 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8872 8873 """ 8874 self.conn.execute(sql_update) 8875 8876 # Delete dataframe 8877 del dataframe_snpeff_hgvs 8878 gc.collect() 8879 8880 else: 8881 8882 log.warning( 8883 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8884 ) 8885 8886 # Remove added columns 8887 for added_column in added_columns: 8888 self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The
uniquifyparameter in thecalculation_snpeff_ann_explodemethod is a boolean flag that determines whether the output should be uniquified or not. When set toTrue, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True - output_format: The
output_formatparameter in thecalculation_snpeff_ann_explodefunction specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields - output_prefix: The
output_prefixparameter in thecalculation_snpeff_ann_explodemethod is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data. By default, the, defaults to ANN_ - snpeff_field: The
snpeff_fieldparameter in thecalculation_snpeff_ann_explodefunction is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Reads the NOMEN options from the parameters (hgvs field, pattern,
        preferred-transcripts file/table/column and source order), computes the
        NOMEN structure per variant with `find_nomen`, then appends each NOMEN
        sub-field to the INFO column of the variants table. Helper columns
        created along the way are dropped at the end.
        """

        # NOMEN field: temporary dataframe column holding the full NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns (created on the fly, dropped at the end)
        added_columns = []

        # Get HGVS field (INFO field holding the hgvs nomenclatures)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts (file of preferred transcripts; first column is used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        # SQL expression selecting the preferred transcript per variant
        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE(review): table name "variants" is hard-coded here while
            # other calculation methods use get_table_variants() — confirm
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN sub-fields per row)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (the lambda is applied immediately, so capturing the loop
                # variable here is safe)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # SQL fragment appending ';<field>=<value>' for non-empty values;
                # dataframe_hgvs is resolved by DuckDB as the registered dataframe
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update: append all NOMEN sub-fields to INFO, joining on the
            # variant coordinates
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
9089 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9090 """ 9091 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9092 pipeline/sample for a variant and updates the variant information in a VCF file. 9093 9094 :param tag: The `tag` parameter is a string that represents the annotation field for the 9095 "findbypipeline" information in the VCF file. It is used to create the annotation field in the 9096 VCF header and to update the corresponding field in the variants table, defaults to 9097 findbypipeline 9098 :type tag: str (optional) 9099 """ 9100 9101 # if FORMAT and samples 9102 if ( 9103 "FORMAT" in self.get_header_columns_as_list() 9104 and self.get_header_sample_list() 9105 ): 9106 9107 # findbypipeline annotation field 9108 findbypipeline_tag = tag 9109 9110 # VCF infos tags 9111 vcf_infos_tags = { 9112 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9113 } 9114 9115 # Prefix 9116 prefix = self.get_explode_infos_prefix() 9117 9118 # Field 9119 findbypipeline_infos = prefix + findbypipeline_tag 9120 9121 # Variants table 9122 table_variants = self.get_table_variants() 9123 9124 # Header 9125 vcf_reader = self.get_header() 9126 9127 # Create variant id 9128 variant_id_column = self.get_variant_id_column() 9129 added_columns = [variant_id_column] 9130 9131 # variant_id, FORMAT and samples 9132 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9133 self.get_header_sample_list() 9134 ) 9135 9136 # Create dataframe 9137 dataframe_findbypipeline = self.get_query_to_df( 9138 f""" SELECT {samples_fields} FROM {table_variants} """ 9139 ) 9140 9141 # Create findbypipeline column 9142 dataframe_findbypipeline[findbypipeline_infos] = ( 9143 dataframe_findbypipeline.apply( 9144 lambda row: findbypipeline( 9145 row, samples=self.get_header_sample_list() 9146 ), 9147 axis=1, 9148 ) 9149 ) 9150 9151 # Add snpeff_hgvs to header 9152 
vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9153 findbypipeline_tag, 9154 ".", 9155 "String", 9156 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9157 "howard calculation", 9158 "0", 9159 self.code_type_map.get("String"), 9160 ) 9161 9162 # Update 9163 sql_update = f""" 9164 UPDATE variants 9165 SET "INFO" = 9166 concat( 9167 CASE 9168 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9169 THEN '' 9170 ELSE concat("INFO", ';') 9171 END, 9172 CASE 9173 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9174 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9175 THEN concat( 9176 '{findbypipeline_tag}=', 9177 dataframe_findbypipeline."{findbypipeline_infos}" 9178 ) 9179 ELSE '' 9180 END 9181 ) 9182 FROM dataframe_findbypipeline 9183 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9184 """ 9185 self.conn.execute(sql_update) 9186 9187 # Remove added columns 9188 for added_column in added_columns: 9189 self.drop_column(column=added_column) 9190 9191 # Delete dataframe 9192 del dataframe_findbypipeline 9193 gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: the `tag` parameter is the annotation field name for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding INFO field in the variants table; defaults to "findbypipeline".
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.

        Does nothing when the VCF has no FORMAT column or no samples.
        """

        # Genotypes are required (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (exploded column holding the concordance value)
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (helper column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (row-wise via helper function)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks like a copy-paste leftover; it is never used because the
            # tag is always in vcf_infos_tags — confirm
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO for non-empty values;
            # dataframe_genotypeconcordance is resolved by DuckDB as the
            # registered dataframe
            # NOTE(review): table name "variants" is hard-coded here while
            # calculation_barcode uses {table_variants} — confirm
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        The barcode is computed per variant from the genotypes of all samples (via the
        `barcode` helper applied row by row), declared in the VCF header as an INFO tag,
        and concatenated into the INFO column of the variants table with a single SQL UPDATE.
        No-op when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag explicitly passed as None/"")
            if not tag:
                tag = "barcode"

            # VCF infos tags (descriptions used for the header INFO declaration)
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the variants table; dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and every sample genotype column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode per variant from the sample genotypes
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Declare the barcode tag in the VCF header (INFO field)
            # NOTE(review): the fallback `vcf_infos_tags.get(tag)` is redundant —
            # it is identical to the primary lookup and can never add information
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '{tag}=<value>' to INFO for every variant with a non-empty barcode.
            # `dataframe_barcode` in the FROM clause is the local pandas dataframe —
            # presumably resolved by duckdb's replacement scan on the Python variable
            # name, so the variable must keep this exact name (TODO confirm).
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove the helper columns added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe eagerly
            del dataframe_barcode
            gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The
`tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode"; defaults to `"barcode"`.
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree is read from the parameter
        `calculation.calculations.BARCODEFAMILY.family_pedigree`, which may be a YAML/JSON
        file path, a JSON string, a comma-separated string of sample names, or a dict;
        when absent, all samples of the VCF are used. The computed barcode is appended to
        each sample genotype as two FORMAT fields: `{tag}` (the barcode) and `{tag}S`
        (the comma-separated list of family samples). No-op when the VCF has no FORMAT
        column or no samples.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no samples
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag explicitly passed as None/"")
            if not tag:
                tag = "BCF"

            # VCF infos tags (descriptions for the two FORMAT declarations below)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter (file path, JSON string, CSV string or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (YAML also parses plain JSON)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, else treat as CSV of sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict — used as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family members are the dict values (member role -> sample name)
                ped_samples = list(ped.values())

            else:
                # No pedigree defined: every sample of the VCF is part of the family
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the variants table; dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and the family sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode per variant from the family genotypes
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two FORMAT fields in the VCF header: barcode and sample list
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column (plus FORMAT): family members get
            # the barcode and the sample list, FORMAT gets the two tag names, other
            # samples get '.' placeholders. Missing genotypes './.' are first padded with
            # one '.' per FORMAT sub-field (derived from FORMAT by regex) before appending.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # strips field names from FORMAT, leaving one ':' per extra sub-field
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # `dataframe_barcode` in FROM is the local pandas dataframe — presumably
            # resolved by duckdb's replacement scan on the variable name (TODO confirm)
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove the helper columns added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe eagerly
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The
`tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF"; defaults to `"BCF"`.
9586 def calculation_trio(self) -> None: 9587 """ 9588 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9589 information to the INFO field of each variant. 9590 """ 9591 9592 # if FORMAT and samples 9593 if ( 9594 "FORMAT" in self.get_header_columns_as_list() 9595 and self.get_header_sample_list() 9596 ): 9597 9598 # trio annotation field 9599 trio_tag = "trio" 9600 9601 # VCF infos tags 9602 vcf_infos_tags = { 9603 "trio": "trio calculation", 9604 } 9605 9606 # Param 9607 param = self.get_param() 9608 9609 # Prefix 9610 prefix = self.get_explode_infos_prefix() 9611 9612 # Trio param 9613 trio_ped = ( 9614 param.get("calculation", {}) 9615 .get("calculations", {}) 9616 .get("TRIO", {}) 9617 .get("trio_pedigree", None) 9618 ) 9619 9620 # Load trio 9621 if trio_ped: 9622 9623 # Trio pedigree is a file 9624 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9625 log.debug("TRIO pedigree is file") 9626 with open(full_path(trio_ped)) as trio_ped: 9627 trio_ped = yaml.safe_load(trio_ped) 9628 9629 # Trio pedigree is a string 9630 elif isinstance(trio_ped, str): 9631 log.debug("TRIO pedigree is str") 9632 try: 9633 trio_ped = json.loads(trio_ped) 9634 log.debug("TRIO pedigree is json str") 9635 except ValueError as e: 9636 trio_samples = trio_ped.split(",") 9637 if len(trio_samples) == 3: 9638 trio_ped = { 9639 "father": trio_samples[0], 9640 "mother": trio_samples[1], 9641 "child": trio_samples[2], 9642 } 9643 log.debug("TRIO pedigree is list str") 9644 else: 9645 msg_error = "TRIO pedigree not well formatted" 9646 log.error(msg_error) 9647 raise ValueError(msg_error) 9648 9649 # Trio pedigree is a dict 9650 elif isinstance(trio_ped, dict): 9651 log.debug("TRIO pedigree is dict") 9652 9653 # Trio pedigree is not well formatted 9654 else: 9655 msg_error = "TRIO pedigree not well formatted" 9656 log.error(msg_error) 9657 raise ValueError(msg_error) 9658 9659 # Construct trio list 9660 trio_samples = [ 9661 
trio_ped.get("father", ""), 9662 trio_ped.get("mother", ""), 9663 trio_ped.get("child", ""), 9664 ] 9665 9666 else: 9667 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9668 samples_list = self.get_header_sample_list() 9669 if len(samples_list) >= 3: 9670 trio_samples = self.get_header_sample_list()[0:3] 9671 trio_ped = { 9672 "father": trio_samples[0], 9673 "mother": trio_samples[1], 9674 "child": trio_samples[2], 9675 } 9676 else: 9677 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9678 log.error(msg_error) 9679 raise ValueError(msg_error) 9680 9681 # Check trio pedigree 9682 if not trio_ped or len(trio_ped) != 3: 9683 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9684 log.error(msg_error) 9685 raise ValueError(msg_error) 9686 9687 # Log 9688 log.info( 9689 f"Calculation 'TRIO' - Samples: " 9690 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9691 ) 9692 9693 # Field 9694 trio_infos = prefix + trio_tag 9695 9696 # Variants table 9697 table_variants = self.get_table_variants() 9698 9699 # Header 9700 vcf_reader = self.get_header() 9701 9702 # Create variant id 9703 variant_id_column = self.get_variant_id_column() 9704 added_columns = [variant_id_column] 9705 9706 # variant_id, FORMAT and samples 9707 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9708 self.get_header_sample_list() 9709 ) 9710 9711 # Create dataframe 9712 dataframe_trio = self.get_query_to_df( 9713 f""" SELECT {samples_fields} FROM {table_variants} """ 9714 ) 9715 9716 # Create trio column 9717 dataframe_trio[trio_infos] = dataframe_trio.apply( 9718 lambda row: trio(row, samples=trio_samples), axis=1 9719 ) 9720 9721 # Add trio to header 9722 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9723 trio_tag, 9724 ".", 9725 "String", 9726 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9727 "howard calculation", 9728 "0", 9729 self.code_type_map.get("String"), 9730 ) 9731 9732 # Update 9733 
sql_update = f""" 9734 UPDATE {table_variants} 9735 SET "INFO" = 9736 concat( 9737 CASE 9738 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9739 THEN '' 9740 ELSE concat("INFO", ';') 9741 END, 9742 CASE 9743 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9744 AND dataframe_trio."{trio_infos}" NOT NULL 9745 THEN concat( 9746 '{trio_tag}=', 9747 dataframe_trio."{trio_infos}" 9748 ) 9749 ELSE '' 9750 END 9751 ) 9752 FROM dataframe_trio 9753 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9754 """ 9755 self.conn.execute(sql_update) 9756 9757 # Remove added columns 9758 for added_column in added_columns: 9759 self.drop_column(column=added_column) 9760 9761 # Delete dataframe 9762 del dataframe_trio 9763 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9765 def calculation_vaf_normalization(self) -> None: 9766 """ 9767 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9768 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9769 :return: The function does not return anything. 9770 """ 9771 9772 # if FORMAT and samples 9773 if ( 9774 "FORMAT" in self.get_header_columns_as_list() 9775 and self.get_header_sample_list() 9776 ): 9777 9778 # vaf_normalization annotation field 9779 vaf_normalization_tag = "VAF" 9780 9781 # VCF infos tags 9782 vcf_infos_tags = { 9783 "VAF": "VAF Variant Frequency", 9784 } 9785 9786 # Prefix 9787 prefix = self.get_explode_infos_prefix() 9788 9789 # Variants table 9790 table_variants = self.get_table_variants() 9791 9792 # Header 9793 vcf_reader = self.get_header() 9794 9795 # Do not calculate if VAF already exists 9796 if "VAF" in vcf_reader.formats: 9797 log.debug("VAF already on genotypes") 9798 return 9799 9800 # Create variant id 9801 variant_id_column = self.get_variant_id_column() 9802 added_columns = [variant_id_column] 9803 9804 # variant_id, FORMAT and samples 9805 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9806 f""" "{sample}" """ for sample in self.get_header_sample_list() 9807 ) 9808 9809 # Create dataframe 9810 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9811 log.debug(f"query={query}") 9812 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9813 9814 vaf_normalization_set = [] 9815 9816 # for each sample vaf_normalization 9817 for sample in self.get_header_sample_list(): 9818 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9819 lambda row: vaf_normalization(row, sample=sample), axis=1 9820 ) 9821 vaf_normalization_set.append( 9822 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9823 ) 9824 9825 # Add VAF to FORMAT 9826 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9827 "FORMAT" 9828 ].apply(lambda x: str(x) + ":VAF") 9829 vaf_normalization_set.append( 9830 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9831 ) 9832 9833 # Add vaf_normalization to header 9834 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9835 id=vaf_normalization_tag, 9836 num="1", 9837 type="Float", 9838 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9839 type_code=self.code_type_map.get("Float"), 9840 ) 9841 9842 # Create fields to add in INFO 9843 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9844 9845 # Update 9846 sql_update = f""" 9847 UPDATE {table_variants} 9848 SET {sql_vaf_normalization_set} 9849 FROM dataframe_vaf_normalization 9850 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9851 9852 """ 9853 self.conn.execute(sql_update) 9854 9855 # Remove added columns 9856 for added_column in added_columns: 9857 self.drop_column(column=added_column) 9858 9859 # Delete dataframe 9860 del dataframe_vaf_normalization 9861 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are produced per variant ('<info>_stats_nb', '_list', '_min',
        '_max', '_mean', '_mediane', '_stdev'), computed by the `genotype_stats` helper
        from all sample genotypes and concatenated into the INFO column with a single
        SQL UPDATE. No-op when the VCF has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per produced statistic (key = INFO tag name)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the variants table; dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and every sample genotype column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics per variant (one dict-like value per row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL expressions building INFO fragments, one per statistic
            sql_vaf_stats_fields = []

            # For each statistic: extract it into its own column, declare its header
            # tag, and build the SQL fragment that appends '<tag>=<value>' to INFO
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into a dedicated column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header (INFO field)
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separates consecutive tags; the first fragment has no separator
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # NOTE(review): the separator is baked into each fragment's THEN branch,
                # so a NULL statistic followed by a non-NULL one can yield a double ';'
                # in INFO — confirm whether downstream parsing tolerates this
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Fragments become the argument list of the SQL concat() below
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append all statistics to INFO. `dataframe_vaf_stats` in FROM is the local
            # pandas dataframe — presumably resolved by duckdb's replacement scan on the
            # Python variable name (TODO confirm)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove the helper columns added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe eagerly
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The
`info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, and the standard deviation; defaults to `"VAF"`.
10001 def calculation_transcripts_annotation( 10002 self, info_json: str = None, info_format: str = None 10003 ) -> None: 10004 """ 10005 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10006 field to it if transcripts are available. 10007 10008 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10009 is a string parameter that represents the information field to be used in the transcripts JSON. 10010 It is used to specify the JSON format for the transcripts information. If no value is provided 10011 when calling the method, it defaults to " 10012 :type info_json: str 10013 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10014 method is a string parameter that specifies the format of the information field to be used in 10015 the transcripts JSON. It is used to define the format of the information field 10016 :type info_format: str 10017 """ 10018 10019 # Create transcripts table 10020 transcripts_table = self.create_transcript_view() 10021 10022 # Add info field 10023 if transcripts_table: 10024 self.transcript_view_to_variants( 10025 transcripts_table=transcripts_table, 10026 transcripts_info_field_json=info_json, 10027 transcripts_info_field_format=info_format, 10028 ) 10029 else: 10030 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The
`info_json` parameter in the `calculation_transcripts_annotation` method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to `None`. - info_format: The
`info_format` parameter in the `calculation_transcripts_annotation` method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field; defaults to `None`.
10032 def calculation_transcripts_prioritization(self) -> None: 10033 """ 10034 The function `calculation_transcripts_prioritization` creates a transcripts table and 10035 prioritizes transcripts based on certain criteria. 10036 """ 10037 10038 # Create transcripts table 10039 transcripts_table = self.create_transcript_view() 10040 10041 # Add info field 10042 if transcripts_table: 10043 self.transcripts_prioritization(transcripts_table=transcripts_table) 10044 else: 10045 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
10047 def calculation_transcripts_export(self) -> None: 10048 """ """ 10049 10050 # Create transcripts table 10051 transcripts_table = self.create_transcript_view() 10052 10053 # Add info field 10054 if transcripts_table: 10055 self.transcripts_export(transcripts_table=transcripts_table) 10056 else: 10057 log.info("No Transcripts to process. Check param.json file configuration")
10063 def transcripts_export( 10064 self, transcripts_table: str = None, param: dict = {} 10065 ) -> bool: 10066 """ """ 10067 10068 log.debug("Start transcripts export...") 10069 10070 # Param 10071 if not param: 10072 param = self.get_param() 10073 10074 # Param export 10075 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10076 10077 # Output file 10078 transcripts_export_output = param_transcript_export.get("output", None) 10079 10080 if not param_transcript_export or not transcripts_export_output: 10081 log.warning(f"No transcriipts export parameters defined!") 10082 return False 10083 10084 # List of transcripts annotations 10085 query_describe = f""" 10086 SELECT column_name 10087 FROM ( 10088 DESCRIBE SELECT * FROM {transcripts_table} 10089 ) 10090 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10091 """ 10092 transcripts_annotations_list = list( 10093 self.get_query_to_df(query=query_describe)["column_name"] 10094 ) 10095 10096 # Create transcripts table for export 10097 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10098 random.choices(string.ascii_uppercase + string.digits, k=10) 10099 ) 10100 query_create_transcripts_table_export = f""" 10101 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10102 """ 10103 self.execute_query(query=query_create_transcripts_table_export) 10104 10105 # Output file format 10106 transcripts_export_output_format = get_file_format( 10107 filename=transcripts_export_output 10108 ) 10109 10110 # Format VCF - construct INFO 10111 if transcripts_export_output_format in ["vcf"]: 10112 10113 # Construct query update INFO and header 10114 query_update_info = [] 10115 for field in transcripts_annotations_list: 10116 10117 # If field not in header 10118 if field not in self.get_header_infos_list(): 10119 10120 # Add PZ Transcript in header 10121 
self.get_header().infos[field] = vcf.parser._Info( 10122 field, 10123 ".", 10124 "String", 10125 f"Annotation '{field}' from transcript view", 10126 "unknown", 10127 "unknown", 10128 0, 10129 ) 10130 10131 # Add field as INFO/tag 10132 query_update_info.append( 10133 f""" 10134 CASE 10135 WHEN "{field}" IS NOT NULL 10136 THEN concat('{field}=', "{field}", ';') 10137 ELSE '' 10138 END 10139 """ 10140 ) 10141 10142 # Query param 10143 query_update_info_value = ( 10144 f""" concat('', {", ".join(query_update_info)}) """ 10145 ) 10146 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10147 10148 else: 10149 10150 # Query param 10151 query_update_info_value = f""" NULL """ 10152 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10153 10154 # Update query INFO column 10155 query_update = f""" 10156 UPDATE {transcripts_table_export} 10157 SET INFO = {query_update_info_value} 10158 10159 """ 10160 self.execute_query(query=query_update) 10161 10162 # Export 10163 self.export_output( 10164 output_file=transcripts_export_output, 10165 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10166 ) 10167 10168 # Drop transcripts export table 10169 query_drop_transcripts_table_export = f""" 10170 DROP TABLE {transcripts_table_export} 10171 """ 10172 self.execute_query(query=query_drop_transcripts_table_export)
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        The transcripts table is prioritized via `self.prioritization`, transcripts are
        ranked per variant (optionally honoring a transcript-preference file), and the
        best-ranked transcript with its PZ fields is concatenated into the INFO column
        of the variants table.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        :raises ValueError: if no transcripts table can be built, a field to explode is
        unknown, or the transcript-preference file does not exist
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table: build the view when not provided by the caller
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                # NOTE(review): typo "availalble" in this runtime message — left
                # unchanged in this documentation-only pass
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
            """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if not exists (needed by the prioritization step)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
                """
            self.execute_query(query_add_info)

        # Prioritization parameters for transcripts
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # Mapping source field -> INFO tag for the PZ fields to export
        pz_param_pzfields = {}

        # INFO tag carrying the selected transcript (e.g. 'PTZTranscript')
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Declare the selected-transcript tag in the header
        # NOTE(review): bare `code_type_map` (not `self.code_type_map` as used by the
        # calculation_* methods) — presumably a module-level map imported via
        # `from howard.functions.commons import *`; confirm
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory PZ fields always produced by the prioritization step
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # Extra PZ fields requested in param: mandatory ones map prefixed->prefixed,
        # others map raw->prefixed and are declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Declare the prefixed annotation tag in the header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # Restrict the prioritization step to the mandatory PZ fields
        # (mutates pz_param, which aliases param["transcripts"]["prioritization"])
        pz_param["pzfields"] = pz_mandatory_fields

        # Run prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # SQL fragments: selected columns, INFO concat pieces, ORDER BY terms
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Ranking order: from param, or Flag/Score DESC by default
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode from INFO into columns before ranking
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
            """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must exist in the header or in the table itself
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (ordered list of preferred transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Whether the preference file order beats the PZ-field order
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Whether transcript versions must match exactly when joining preferences
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Join condition: exact transcript id, or id without version suffix
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Rank transcripts per variant; `transcripts_preference_dataframe` is the
            # local pandas dataframe, resolved by duckdb's replacement scan on the name
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
                """

        else:

            # Rank transcripts per variant by the PZ-field order only
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
                """

        # Append the best-ranked transcript (rn = 1) and its PZ fields to the INFO
        # column of the variants table, joined on the variant key columns
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
            """

        self.execute_query(query=query_update)

        # Return
        return True
The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter identifies the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other options.
Returns

The function `transcripts_prioritization` returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
10485 def create_transcript_view_from_columns_map( 10486 self, 10487 transcripts_table: str = "transcripts", 10488 columns_maps: dict = {}, 10489 added_columns: list = [], 10490 temporary_tables: list = None, 10491 annotation_fields: list = None, 10492 column_rename: dict = {}, 10493 column_clean: bool = False, 10494 column_case: str = None, 10495 ) -> tuple[list, list, list]: 10496 """ 10497 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10498 specified columns mapping for transcripts data. 10499 10500 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10501 of the table where the transcripts data is stored or will be stored in the database. This table 10502 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10503 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10504 :type transcripts_table: str (optional) 10505 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10506 about how to map columns from a transcripts table to create a view. Each entry in the 10507 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10508 typically includes details such as the main transcript column and additional information columns 10509 :type columns_maps: dict 10510 :param added_columns: The `added_columns` parameter in the 10511 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10512 that will be added to the view being created based on the columns map provided. 
These columns 10513 are generated by exploding the transcript information columns along with the main transcript 10514 column 10515 :type added_columns: list 10516 :param temporary_tables: The `temporary_tables` parameter in the 10517 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10518 tables created during the process of creating a transcript view from a columns map. These 10519 temporary tables are used to store intermediate results or transformations before the final view 10520 is generated 10521 :type temporary_tables: list 10522 :param annotation_fields: The `annotation_fields` parameter in the 10523 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10524 used for annotation in the query view creation process. These fields are extracted from the 10525 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10526 :type annotation_fields: list 10527 :param column_rename: The `column_rename` parameter in the 10528 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10529 custom renaming for columns during the creation of the temporary table view. This parameter 10530 provides a mapping of original column names to the desired renamed column names. By using this 10531 parameter, 10532 :type column_rename: dict 10533 :param column_clean: The `column_clean` parameter in the 10534 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10535 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10536 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10537 False 10538 :type column_clean: bool (optional) 10539 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10540 function is used to specify the case transformation to be applied to the columns during the view 10541 creation process. It allows you to control whether the column values should be converted to 10542 lowercase, uppercase, or remain unchanged 10543 :type column_case: str 10544 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10545 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10546 """ 10547 10548 log.debug("Start transcrpts view creation from columns map...") 10549 10550 # "from_columns_map": [ 10551 # { 10552 # "transcripts_column": "Ensembl_transcriptid", 10553 # "transcripts_infos_columns": [ 10554 # "genename", 10555 # "Ensembl_geneid", 10556 # "LIST_S2_score", 10557 # "LIST_S2_pred", 10558 # ], 10559 # }, 10560 # { 10561 # "transcripts_column": "Ensembl_transcriptid", 10562 # "transcripts_infos_columns": [ 10563 # "genename", 10564 # "VARITY_R_score", 10565 # "Aloft_pred", 10566 # ], 10567 # }, 10568 # ], 10569 10570 # Init 10571 if temporary_tables is None: 10572 temporary_tables = [] 10573 if annotation_fields is None: 10574 annotation_fields = [] 10575 10576 # Variants table 10577 table_variants = self.get_table_variants() 10578 10579 for columns_map in columns_maps: 10580 10581 # Transcript column 10582 transcripts_column = columns_map.get("transcripts_column", None) 10583 10584 # Transcripts infos columns 10585 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10586 10587 # Transcripts infos columns rename 10588 column_rename = columns_map.get("column_rename", column_rename) 10589 10590 # Transcripts infos columns clean 10591 column_clean = columns_map.get("column_clean", column_clean) 10592 10593 # Transcripts infos columns case 10594 column_case = 
columns_map.get("column_case", column_case) 10595 10596 if transcripts_column is not None: 10597 10598 # Explode 10599 added_columns += self.explode_infos( 10600 fields=[transcripts_column] + transcripts_infos_columns 10601 ) 10602 10603 # View clauses 10604 clause_select_variants = [] 10605 clause_select_tanscripts = [] 10606 for field in [transcripts_column] + transcripts_infos_columns: 10607 10608 # AS field 10609 as_field = field 10610 10611 # Rename 10612 if column_rename: 10613 as_field = column_rename.get(as_field, as_field) 10614 10615 # Clean 10616 if column_clean: 10617 as_field = clean_annotation_field(as_field) 10618 10619 # Case 10620 if column_case: 10621 if column_case.lower() in ["lower"]: 10622 as_field = as_field.lower() 10623 elif column_case.lower() in ["upper"]: 10624 as_field = as_field.upper() 10625 10626 # Clause select Variants 10627 clause_select_variants.append( 10628 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10629 ) 10630 10631 if field in [transcripts_column]: 10632 clause_select_tanscripts.append( 10633 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10634 ) 10635 else: 10636 clause_select_tanscripts.append( 10637 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10638 ) 10639 annotation_fields.append(as_field) 10640 10641 # Querey View 10642 query = f""" 10643 SELECT 10644 "#CHROM", POS, REF, ALT, INFO, 10645 "{transcripts_column}" AS 'transcript', 10646 {", ".join(clause_select_tanscripts)} 10647 FROM ( 10648 SELECT 10649 "#CHROM", POS, REF, ALT, INFO, 10650 {", ".join(clause_select_variants)} 10651 FROM {table_variants} 10652 ) 10653 WHERE "{transcripts_column}" IS NOT NULL 10654 """ 10655 10656 # Create temporary table 10657 temporary_table = transcripts_table + "".join( 10658 random.choices(string.ascii_uppercase + string.digits, k=10) 10659 ) 10660 10661 # Temporary_tables 10662 temporary_tables.append(temporary_table) 10663 query_view = f""" 10664 CREATE TEMPORARY TABLE 
{temporary_table} 10665 AS ({query}) 10666 """ 10667 self.execute_query(query=query_view) 10668 10669 return added_columns, temporary_tables, annotation_fields
The `create_transcript_view_from_columns_map` function generates a temporary table view based on
specified columns mapping for transcripts data.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, and predictions. It defaults to "transcripts".
- columns_maps: The `columns_maps` parameter contains information about how to map columns from a transcripts table to create a view. Each entry represents a mapping configuration for a specific set of columns, typically including the main transcript column and additional information columns.
- added_columns: The `added_columns` parameter is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column.
- temporary_tables: The `temporary_tables` parameter is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated.
- annotation_fields: The `annotation_fields` parameter is a list that stores the fields used for annotation in the query view creation process. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the columns map.
- column_rename: The `column_rename` parameter is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view, as a mapping of original column names to the desired renamed column names.
- column_clean: The `column_clean` parameter is a boolean flag that determines whether the column names should be cleaned (removing non-alphanumeric characters). It defaults to False.
- column_case: The `column_case` parameter specifies the case transformation to be applied to the columns during the view creation process: "lower", "upper", or unchanged.
Returns

The `create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10671 def create_transcript_view_from_column_format( 10672 self, 10673 transcripts_table: str = "transcripts", 10674 column_formats: dict = {}, 10675 temporary_tables: list = None, 10676 annotation_fields: list = None, 10677 column_rename: dict = {}, 10678 column_clean: bool = False, 10679 column_case: str = None, 10680 ) -> tuple[list, list, list]: 10681 """ 10682 The `create_transcript_view_from_column_format` function generates a transcript view based on 10683 specified column formats, adds additional columns and annotation fields, and returns the list of 10684 temporary tables and annotation fields. 10685 10686 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10687 of the table containing the transcripts data. This table will be used as the base table for 10688 creating the transcript view. The default value for this parameter is "transcripts", but you can 10689 provide a different table name if needed, defaults to transcripts 10690 :type transcripts_table: str (optional) 10691 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10692 about the columns to be used for creating the transcript view. Each entry in the dictionary 10693 specifies the mapping between a transcripts column and a transcripts infos column. This 10694 parameter allows you to define how the columns from the transcripts table should be transformed 10695 or mapped 10696 :type column_formats: dict 10697 :param temporary_tables: The `temporary_tables` parameter in the 10698 `create_transcript_view_from_column_format` function is a list that stores the names of 10699 temporary views created during the process of creating a transcript view from a column format. 
10700 These temporary views are used to manipulate and extract data before generating the final 10701 transcript view 10702 :type temporary_tables: list 10703 :param annotation_fields: The `annotation_fields` parameter in the 10704 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10705 that are extracted from the temporary views created during the process. These annotation fields 10706 are obtained by querying the temporary views and extracting the column names excluding specific 10707 columns like `#CH 10708 :type annotation_fields: list 10709 :param column_rename: The `column_rename` parameter in the 10710 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10711 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10712 column names to new column names in this dictionary, you can rename specific columns during the 10713 process 10714 :type column_rename: dict 10715 :param column_clean: The `column_clean` parameter in the 10716 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10717 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10718 will be cleaned during the creation of the transcript view based on the specified column format, 10719 defaults to False 10720 :type column_clean: bool (optional) 10721 :param column_case: The `column_case` parameter in the 10722 `create_transcript_view_from_column_format` function is used to specify the case transformation 10723 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10724 to convert the column names to uppercase or lowercase, respectively 10725 :type column_case: str 10726 :return: The `create_transcript_view_from_column_format` function returns two lists: 10727 `temporary_tables` and `annotation_fields`. 
10728 """ 10729 10730 log.debug("Start transcrpts view creation from column format...") 10731 10732 # "from_column_format": [ 10733 # { 10734 # "transcripts_column": "ANN", 10735 # "transcripts_infos_column": "Feature_ID", 10736 # } 10737 # ], 10738 10739 # Init 10740 if temporary_tables is None: 10741 temporary_tables = [] 10742 if annotation_fields is None: 10743 annotation_fields = [] 10744 10745 for column_format in column_formats: 10746 10747 # annotation field and transcript annotation field 10748 annotation_field = column_format.get("transcripts_column", "ANN") 10749 transcript_annotation = column_format.get( 10750 "transcripts_infos_column", "Feature_ID" 10751 ) 10752 10753 # Transcripts infos columns rename 10754 column_rename = column_format.get("column_rename", column_rename) 10755 10756 # Transcripts infos columns clean 10757 column_clean = column_format.get("column_clean", column_clean) 10758 10759 # Transcripts infos columns case 10760 column_case = column_format.get("column_case", column_case) 10761 10762 # Temporary View name 10763 temporary_view_name = transcripts_table + "".join( 10764 random.choices(string.ascii_uppercase + string.digits, k=10) 10765 ) 10766 10767 # Create temporary view name 10768 temporary_view_name = self.annotation_format_to_table( 10769 uniquify=True, 10770 annotation_field=annotation_field, 10771 view_name=temporary_view_name, 10772 annotation_id=transcript_annotation, 10773 column_rename=column_rename, 10774 column_clean=column_clean, 10775 column_case=column_case, 10776 ) 10777 10778 # Annotation fields 10779 if temporary_view_name: 10780 query_annotation_fields = f""" 10781 SELECT * 10782 FROM ( 10783 DESCRIBE SELECT * 10784 FROM {temporary_view_name} 10785 ) 10786 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10787 """ 10788 df_annotation_fields = self.get_query_to_df( 10789 query=query_annotation_fields 10790 ) 10791 10792 # Add temporary view and annotation fields 10793 
temporary_tables.append(temporary_view_name) 10794 annotation_fields += list(set(df_annotation_fields["column_name"])) 10795 10796 return temporary_tables, annotation_fields
The `create_transcript_view_from_column_format` function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value is "transcripts", but a different table name can be provided if needed.
- column_formats: The `column_formats` parameter contains information about the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column, defining how the columns from the transcripts table should be transformed or mapped.
- temporary_tables: The `temporary_tables` parameter is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
- annotation_fields: The `annotation_fields` parameter is a list that stores the annotation fields extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding key columns such as '#CHROM', 'POS', 'REF', and 'ALT'.
- column_rename: The `column_rename` parameter is a dictionary that allows custom renaming of columns in the transcripts infos table, as a mapping of original column names to new column names.
- column_clean: The `column_clean` parameter is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process during the creation of the transcript view. It defaults to False.
- column_case: The `column_case` parameter specifies the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names accordingly.
Returns

The `create_transcript_view_from_column_format` function returns a tuple of two lists: `temporary_tables` and `annotation_fields`.
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        Build the transcripts table ("transcript view") from the variants table.

        The method reads the `transcripts.struct` section of the parameters,
        creates intermediate per-source transcript tables (from columns maps
        and/or column formats), merges them with `UNION BY NAME`, optionally
        maps and filters transcript identifiers through a mapping file, and
        finally materializes the merged result as `transcripts_table`.

        :param transcripts_table: Name of the table that will store the final
            transcript view; when None, taken from param `transcripts.table`
            (default "transcripts")
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: Whether to drop an existing transcripts
            table before creating the new one, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: Parameters dictionary; when empty, `self.get_param()` is
            used. Relevant keys live under "transcripts": "struct", "table",
            "transcript_id_remove_version", "transcript_id_mapping_file",
            "transcript_id_mapping_force"
        :type param: dict
        :return: The name of the transcripts table created or modified, or
            None when no `transcripts.struct` is configured
        """

        log.debug("Start transcripts view creation...")

        # Default
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: drop the ".N" version suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (transcript/alias pairs)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: keep only transcripts listed in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (dropped again at the end)
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # From columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove key/reserved fields so only true annotations remain
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query (UNION BY NAME tolerates differing columns)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Merge on transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (distinct transcripts aggregated per variant)
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript dataframe
                # NOTE(review): `transcript_id_mapping_dataframe` looks unused in
                # Python, but the SQL below references it by this exact name —
                # presumably resolved by DuckDB's dataframe replacement scan;
                # confirm before removing the assignment
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Transcript column for group by merge: prefer the mapped ID,
                # fall back to the original one
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                    {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                # NOTE(review): these two variables are assigned but the query
                # below inlines `query_transcript_column` directly — apparently
                # dead on this branch
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove the columns that explode_infos added to the variants table
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            transcripts_table = None

        return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The `transcripts_table` parameter specifies the name of the table that
  will store the final transcript view data. If a table name is not provided, the function
  creates a new table to store the transcript view data. Defaults to "transcripts".
- transcripts_table_drop: The `transcripts_table_drop` parameter is a boolean that determines
  whether to drop an existing transcripts table before creating a new one. If set to True,
  the function drops the existing transcripts table if it exists. Defaults to False.
- param: The `param` parameter is a dictionary containing the information needed to create a
  transcript view, such as the structure of the transcripts, column mappings, column formats,
  and other details required to generate the view. It allows for flexibility and customization.
Returns
The create_transcript_view function returns the name of the transcripts table that was
created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. snpEff/VEP "ANN") into a
        temporary table with one typed column per annotation sub-field.

        The sub-field names are parsed from the quoted, pipe-separated list in the
        field's header description. Each variant's annotation string is converted to
        JSON (one entry per transcript), the JSON keys are discovered, a DuckDB column
        type is detected per key, and a temporary table named `view_name` is created
        with one row per transcript and a `transcript` column taken from
        `annotation_id`.

        :param uniquify: if True, ask `explode_annotation_format` to de-duplicate
            annotation entries, defaults to True
        :param annotation_field: INFO field holding the annotation, defaults to "ANN"
        :param annotation_id: annotation sub-field used as the `transcript` column
            of the resulting table, defaults to "Feature_ID"
        :param annotation_id: subject to the same rename/clean transforms as the
            other columns so it still matches after transformation
        :param view_name: name of the temporary table to create, defaults to
            "transcripts"
        :param column_rename: optional mapping of original column names to new names
        :param column_clean: if True, clean column names with
            `clean_annotation_field`, defaults to False
        :param column_case: optional case transform for column names
            ("lower" or "upper")
        :return: the name of the created table (`view_name`), or None when
            `annotation_field` is absent from the VCF header
        """

        # Name of the intermediate JSON column holding the exploded annotation
        annotation_format = "annotation_explode"

        # Apply the same rename/clean transforms to the id field so it matches
        # the transformed column names used in the final SELECT
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty configured prefix is forced to "INFO/" here
        # (the configured value itself is discarded) — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation and its exploded JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added along the way, dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the quoted,
            # " | "-separated list in the header description
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitize each sub-field name to alphanumerics only;
                    # map sanitized name -> original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant id column (added, then dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variants plus the raw annotation column into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string into JSON (one object per transcript)
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys present in the first annotation entry
            # (DuckDB queries the pandas DataFrame by name)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per discovered key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Raw key and its cleaned output name
                key = row.iloc[0]
                key_clean = key

                # Optional rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Optional cleaning
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Optional case transform
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to sample them for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings, then replace empty
                # strings or None with NaN and drop those rows, so type
                # detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect the DuckDB column type from the remaining values
                column_type = detect_column_type(df_json_type[key_clean])

                # SELECT expression: empty strings become NULL, then cast
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, one row per transcript, with the
            # annotation_id column duplicated as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from header: nothing to build
            view_name = None

        # Drop the temporary columns added above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The annotation_format_to_table function converts annotation data from a VCF file into a
structured table format, ensuring unique values and creating a temporary table for further
processing or analysis.
Parameters
- uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
  unique values in the output. If set to True, the function makes sure that the output
  values are unique. Defaults to True.
- annotation_field: The `annotation_field` parameter refers to the field in the VCF file
  that contains the annotation information for each variant. This field is used to extract
  the annotation details for further processing. Defaults to "ANN".
- annotation_id: The `annotation_id` parameter specifies the identifier of the annotation
  feature. This identifier is used as a column name in the resulting table or view and
  helps to uniquely identify each annotation entry. Defaults to "Feature_ID".
- view_name: The `view_name` parameter specifies the name of the temporary table created
  to store the transformed annotation data in a structured format for further processing
  or analysis. Defaults to "transcripts".
- column_rename: The `column_rename` parameter is a dictionary of custom column renamings;
  each key-value pair renames a specific column in the resulting table or view.
- column_clean: The `column_clean` parameter is a boolean flag that determines whether the
  annotation field names undergo a cleaning step (for example, removing unwanted characters
  or formatting inconsistencies). Defaults to False.
- column_case: The `column_case` parameter specifies the case transformation ("lower" or
  "upper") applied to the column names extracted from the annotation data, for consistency
  or other specific requirements.
Returns
The annotation_format_to_table function returns the name of the view created, which is
stored in the variable `view_name`.
11282 def transcript_view_to_variants( 11283 self, 11284 transcripts_table: str = None, 11285 transcripts_column_id: str = None, 11286 transcripts_info_json: str = None, 11287 transcripts_info_field_json: str = None, 11288 transcripts_info_format: str = None, 11289 transcripts_info_field_format: str = None, 11290 param: dict = {}, 11291 ) -> bool: 11292 """ 11293 The `transcript_view_to_variants` function updates a variants table with information from 11294 transcripts in JSON format. 11295 11296 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11297 table containing the transcripts data. If this parameter is not provided, the function will 11298 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11299 :type transcripts_table: str 11300 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11301 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11302 identifier is used to match transcripts with variants in the database 11303 :type transcripts_column_id: str 11304 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11305 of the column in the variants table where the transcripts information will be stored in JSON 11306 format. This parameter allows you to define the column in the variants table that will hold the 11307 JSON-formatted information about transcripts 11308 :type transcripts_info_json: str 11309 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11310 specify the field in the VCF header that will contain information about transcripts in JSON 11311 format. 
This field will be added to the VCF header as an INFO field with the specified name 11312 :type transcripts_info_field_json: str 11313 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11314 format of the information about transcripts that will be stored in the variants table. This 11315 format can be used to define how the transcript information will be structured or displayed 11316 within the variants table 11317 :type transcripts_info_format: str 11318 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11319 specify the field in the VCF header that will contain information about transcripts in a 11320 specific format. This field will be added to the VCF header as an INFO field with the specified 11321 name 11322 :type transcripts_info_field_format: str 11323 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11324 that contains various configuration settings related to transcripts. It is used to provide 11325 default values for certain parameters if they are not explicitly provided when calling the 11326 method. The `param` dictionary can be passed as an argument 11327 :type param: dict 11328 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11329 if the operation is successful and `False` if certain conditions are not met. 
11330 """ 11331 11332 msg_info_prefix = "Start transcripts view to variants annotations" 11333 11334 log.debug(f"{msg_info_prefix}...") 11335 11336 # Default 11337 transcripts_table_default = "transcripts" 11338 transcripts_column_id_default = "transcript" 11339 transcripts_info_json_default = None 11340 transcripts_info_format_default = None 11341 transcripts_info_field_json_default = None 11342 transcripts_info_field_format_default = None 11343 11344 # Param 11345 if not param: 11346 param = self.get_param() 11347 11348 # Transcripts table 11349 if transcripts_table is None: 11350 transcripts_table = param.get("transcripts", {}).get( 11351 "table", transcripts_table_default 11352 ) 11353 11354 # Transcripts column ID 11355 if transcripts_column_id is None: 11356 transcripts_column_id = param.get("transcripts", {}).get( 11357 "column_id", transcripts_column_id_default 11358 ) 11359 11360 # Transcripts info json 11361 if transcripts_info_json is None: 11362 transcripts_info_json = param.get("transcripts", {}).get( 11363 "transcripts_info_json", transcripts_info_json_default 11364 ) 11365 11366 # Transcripts info field JSON 11367 if transcripts_info_field_json is None: 11368 transcripts_info_field_json = param.get("transcripts", {}).get( 11369 "transcripts_info_field_json", transcripts_info_field_json_default 11370 ) 11371 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11372 # transcripts_info_json = transcripts_info_field_json 11373 11374 # Transcripts info format 11375 if transcripts_info_format is None: 11376 transcripts_info_format = param.get("transcripts", {}).get( 11377 "transcripts_info_format", transcripts_info_format_default 11378 ) 11379 11380 # Transcripts info field FORMAT 11381 if transcripts_info_field_format is None: 11382 transcripts_info_field_format = param.get("transcripts", {}).get( 11383 "transcripts_info_field_format", transcripts_info_field_format_default 11384 ) 11385 # if ( 11386 # 
transcripts_info_field_format is not None 11387 # and transcripts_info_format is None 11388 # ): 11389 # transcripts_info_format = transcripts_info_field_format 11390 11391 # Variants table 11392 table_variants = self.get_table_variants() 11393 11394 # Check info columns param 11395 if ( 11396 transcripts_info_json is None 11397 and transcripts_info_field_json is None 11398 and transcripts_info_format is None 11399 and transcripts_info_field_format is None 11400 ): 11401 return False 11402 11403 # Transcripts infos columns 11404 query_transcripts_infos_columns = f""" 11405 SELECT * 11406 FROM ( 11407 DESCRIBE SELECT * FROM {transcripts_table} 11408 ) 11409 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11410 """ 11411 transcripts_infos_columns = list( 11412 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11413 ) 11414 11415 # View results 11416 clause_select = [] 11417 clause_to_json = [] 11418 clause_to_format = [] 11419 for field in transcripts_infos_columns: 11420 # Do not consider INFO field for export into fields 11421 if field not in ["INFO"]: 11422 clause_select.append( 11423 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11424 ) 11425 clause_to_json.append(f""" '{field}': "{field}" """) 11426 clause_to_format.append(f""" "{field}" """) 11427 11428 # Update 11429 update_set_json = [] 11430 update_set_format = [] 11431 11432 # VCF header 11433 vcf_reader = self.get_header() 11434 11435 # Transcripts to info column in JSON 11436 if transcripts_info_json: 11437 11438 # Create column on variants table 11439 self.add_column( 11440 table_name=table_variants, 11441 column_name=transcripts_info_json, 11442 column_type="JSON", 11443 default_value=None, 11444 drop=False, 11445 ) 11446 11447 # Add header 11448 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11449 transcripts_info_json, 11450 ".", 11451 "String", 11452 "Transcripts in JSON format", 11453 "unknwon", 
11454 "unknwon", 11455 self.code_type_map["String"], 11456 ) 11457 11458 # Add to update 11459 update_set_json.append( 11460 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11461 ) 11462 11463 # Transcripts to info field in JSON 11464 if transcripts_info_field_json: 11465 11466 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11467 11468 # Add to update 11469 update_set_json.append( 11470 f""" 11471 INFO = concat( 11472 CASE 11473 WHEN INFO NOT IN ('', '.') 11474 THEN INFO 11475 ELSE '' 11476 END, 11477 CASE 11478 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11479 THEN concat( 11480 ';{transcripts_info_field_json}=', 11481 t.{transcripts_info_json} 11482 ) 11483 ELSE '' 11484 END 11485 ) 11486 """ 11487 ) 11488 11489 # Add header 11490 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11491 transcripts_info_field_json, 11492 ".", 11493 "String", 11494 "Transcripts in JSON format", 11495 "unknwon", 11496 "unknwon", 11497 self.code_type_map["String"], 11498 ) 11499 11500 if update_set_json: 11501 11502 # Update query 11503 query_update = f""" 11504 UPDATE {table_variants} 11505 SET {", ".join(update_set_json)} 11506 FROM 11507 ( 11508 SELECT 11509 "#CHROM", POS, REF, ALT, 11510 concat( 11511 '{{', 11512 string_agg( 11513 '"' || "{transcripts_column_id}" || '":' || 11514 to_json(json_output) 11515 ), 11516 '}}' 11517 )::JSON AS {transcripts_info_json} 11518 FROM 11519 ( 11520 SELECT 11521 "#CHROM", POS, REF, ALT, 11522 "{transcripts_column_id}", 11523 to_json( 11524 {{{",".join(clause_to_json)}}} 11525 )::JSON AS json_output 11526 FROM 11527 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11528 WHERE "{transcripts_column_id}" IS NOT NULL 11529 ) 11530 GROUP BY "#CHROM", POS, REF, ALT 11531 ) AS t 11532 WHERE {table_variants}."#CHROM" = t."#CHROM" 11533 AND {table_variants}."POS" = t."POS" 11534 AND {table_variants}."REF" = t."REF" 11535 AND 
{table_variants}."ALT" = t."ALT" 11536 """ 11537 11538 self.execute_query(query=query_update) 11539 11540 # Transcripts to info column in FORMAT 11541 if transcripts_info_format: 11542 11543 # Create column on variants table 11544 self.add_column( 11545 table_name=table_variants, 11546 column_name=transcripts_info_format, 11547 column_type="VARCHAR", 11548 default_value=None, 11549 drop=False, 11550 ) 11551 11552 # Add header 11553 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11554 transcripts_info_format, 11555 ".", 11556 "String", 11557 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11558 "unknwon", 11559 "unknwon", 11560 self.code_type_map["String"], 11561 ) 11562 11563 # Add to update 11564 update_set_format.append( 11565 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11566 ) 11567 11568 else: 11569 11570 # Set variable for internal queries 11571 transcripts_info_format = "transcripts_info_format" 11572 11573 # Transcripts to info field in JSON 11574 if transcripts_info_field_format: 11575 11576 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11577 11578 # Add to update 11579 update_set_format.append( 11580 f""" 11581 INFO = concat( 11582 CASE 11583 WHEN INFO NOT IN ('', '.') 11584 THEN INFO 11585 ELSE '' 11586 END, 11587 CASE 11588 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11589 THEN concat( 11590 ';{transcripts_info_field_format}=', 11591 t.{transcripts_info_format} 11592 ) 11593 ELSE '' 11594 END 11595 ) 11596 """ 11597 ) 11598 11599 # Add header 11600 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11601 transcripts_info_field_format, 11602 ".", 11603 "String", 11604 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11605 "unknwon", 11606 "unknwon", 11607 self.code_type_map["String"], 11608 ) 11609 11610 if update_set_format: 11611 11612 # Update query 11613 query_update = f""" 11614 UPDATE 
{table_variants} 11615 SET {", ".join(update_set_format)} 11616 FROM 11617 ( 11618 SELECT 11619 "#CHROM", POS, REF, ALT, 11620 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11621 FROM 11622 ( 11623 SELECT 11624 "#CHROM", POS, REF, ALT, 11625 "{transcripts_column_id}", 11626 concat( 11627 "{transcripts_column_id}", 11628 '|', 11629 {", '|', ".join(clause_to_format)} 11630 ) AS {transcripts_info_format} 11631 FROM 11632 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11633 ) 11634 GROUP BY "#CHROM", POS, REF, ALT 11635 ) AS t 11636 WHERE {table_variants}."#CHROM" = t."#CHROM" 11637 AND {table_variants}."POS" = t."POS" 11638 AND {table_variants}."REF" = t."REF" 11639 AND {table_variants}."ALT" = t."ALT" 11640 """ 11641 11642 self.execute_query(query=query_update) 11643 11644 return True
The transcript_view_to_variants function updates a variants table with information from
transcripts in JSON format.
Parameters
- transcripts_table: The `transcripts_table` parameter specifies the name of the table
  containing the transcripts data. If this parameter is not provided, the function attempts
  to retrieve it from the `param` dictionary or uses the default value "transcripts".
- transcripts_column_id: The `transcripts_column_id` parameter specifies the column in
  `transcripts_table` that contains the unique identifier for each transcript. This
  identifier is used to match transcripts with variants in the database.
- transcripts_info_json: The `transcripts_info_json` parameter specifies the name of the
  column in the variants table where the transcripts information will be stored in JSON
  format.
- transcripts_info_field_json: The `transcripts_info_field_json` parameter specifies the
  INFO field, added to the VCF header, that will contain the transcripts information in
  JSON format.
- transcripts_info_format: The `transcripts_info_format` parameter specifies the name of
  the column in the variants table where the transcripts information will be stored in a
  structured format.
- transcripts_info_field_format: The `transcripts_info_field_format` parameter specifies
  the INFO field, added to the VCF header, that will contain the transcripts information
  in a structured format.
- param: The `param` parameter is a dictionary with configuration settings related to
  transcripts; it provides default values for parameters that are not explicitly provided
  when calling the method.
Returns
The transcript_view_to_variants function returns a boolean value: True if the operation is
successful and False if certain conditions are not met.
11646 def rename_info_fields( 11647 self, fields_to_rename: dict = None, table: str = None 11648 ) -> dict: 11649 """ 11650 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11651 corresponding INFO fields in the variants table. 11652 11653 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11654 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11655 represent the original field names that need to be renamed, and the corresponding values 11656 represent the new names to which the fields should be 11657 :type fields_to_rename: dict 11658 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11659 the table in which the variants data is stored. This table contains information about genetic 11660 variants, and the function updates the corresponding INFO fields in this table when renaming 11661 specified fields in the VCF file header 11662 :type table: str 11663 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11664 the original field names as keys and their corresponding new names (or None if the field was 11665 removed) as values after renaming or removing specified fields in a VCF file header and updating 11666 corresponding INFO fields in the variants table. 
11667 """ 11668 11669 # Init 11670 fields_renamed = {} 11671 config = self.get_config() 11672 access = config.get("access") 11673 11674 if table is None: 11675 table = self.get_table_variants() 11676 11677 if fields_to_rename is not None and access not in ["RO"]: 11678 11679 log.info("Rename or remove fields...") 11680 11681 # Header 11682 header = self.get_header() 11683 11684 for field_to_rename, field_renamed in fields_to_rename.items(): 11685 11686 if field_to_rename in header.infos: 11687 11688 # Rename header 11689 if field_renamed is not None: 11690 header.infos[field_renamed] = vcf.parser._Info( 11691 field_renamed, 11692 header.infos[field_to_rename].num, 11693 header.infos[field_to_rename].type, 11694 header.infos[field_to_rename].desc, 11695 header.infos[field_to_rename].source, 11696 header.infos[field_to_rename].version, 11697 header.infos[field_to_rename].type_code, 11698 ) 11699 del header.infos[field_to_rename] 11700 11701 # Rename INFO patterns 11702 field_pattern = rf'(^|;)({field_to_rename})=([^;]*)' 11703 if field_renamed is not None: 11704 field_renamed_pattern = rf'\1{field_renamed}=\3' 11705 else: 11706 field_renamed_pattern = '' 11707 11708 # Rename INFO 11709 query = f""" 11710 UPDATE {table} 11711 SET 11712 INFO = regexp_replace(INFO, '{field_pattern}', '{field_renamed_pattern}', 'g') 11713 """ 11714 self.execute_query(query=query) 11715 11716 # Return 11717 fields_renamed[field_to_rename] = field_renamed 11718 11719 # Log 11720 if field_renamed is not None: 11721 log.info(f"Rename or remove fields: field '{field_to_rename}' renamed to '{field_renamed}'") 11722 else: 11723 log.info(f"Rename or remove fields: field '{field_to_rename}' removed") 11724 11725 return fields_renamed
The rename_info_fields function renames specified fields in a VCF file header and updates
the corresponding INFO fields in the variants table.
Parameters
- fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
  mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys of the
  dictionary are the original field names, and the corresponding values are the new names
  (or None to remove the field).
- table: The `table` parameter is the name of the table in which the variants data is
  stored. The function updates the corresponding INFO fields in this table when renaming
  fields in the VCF header.
Returns
The rename_info_fields function returns a dictionary, `fields_renamed`, that contains the
original field names as keys and their corresponding new names (or None if the field was
removed) as values.
11727 def calculation_rename_info_fields( 11728 self, 11729 fields_to_rename: dict = None, 11730 table: str = None, 11731 operation_name: str = "RENAME_INFO_FIELDS", 11732 ) -> None: 11733 """ 11734 The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates 11735 fields to rename and table if provided, and then calls another function to rename the fields. 11736 11737 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11738 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11739 the key and the new field name as the value 11740 :type fields_to_rename: dict 11741 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11742 specify the name of the table for which the fields are to be renamed. It is a string type 11743 parameter 11744 :type table: str 11745 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11746 method is a string that specifies the name of the operation being performed. 
In this context, it 11747 is used as a default value for the operation name if not explicitly provided when calling the 11748 function, defaults to RENAME_INFO_FIELDS 11749 :type operation_name: str (optional) 11750 """ 11751 11752 # Param 11753 param = self.get_param() 11754 11755 # Get param fields to rename 11756 param_fields_to_rename = ( 11757 param.get("calculation", {}) 11758 .get("calculations", {}) 11759 .get(operation_name, {}) 11760 .get("fields_to_rename", None) 11761 ) 11762 11763 # Get param table 11764 param_table = ( 11765 param.get("calculation", {}) 11766 .get("calculations", {}) 11767 .get(operation_name, {}) 11768 .get("table", None) 11769 ) 11770 11771 # Init fields_to_rename 11772 if fields_to_rename is None: 11773 fields_to_rename = param_fields_to_rename 11774 11775 # Init table 11776 if table is None: 11777 table = param_table 11778 11779 renamed_fields = self.rename_info_fields( 11780 fields_to_rename=fields_to_rename, table=table 11781 ) 11782 11783 log.debug(f"renamed_fields:{renamed_fields}")
The calculation_rename_info_fields function retrieves parameters from a dictionary, updates
the fields to rename and the table if provided, and then calls another function to rename
the fields.
Parameters
- fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
  fields to be renamed in a table. Each key-value pair in the dictionary maps the original
  field name (key) to the new field name (value).
- table: The `table` parameter specifies the name of the table whose fields are to be
  renamed. It is a string-type parameter.
- operation_name: The `operation_name` parameter is a string that specifies the name of the
  operation being performed. It is used to look up default values when arguments are not
  explicitly provided when calling the function. Defaults to "RENAME_INFO_FIELDS".